File size: 27,263 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
# Copyright 2010 by Thomas Schmitt.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the "seqxml" file format, SeqXML.

This module is for reading and writing SeqXML format files as
SeqRecord objects, and is expected to be used via the Bio.SeqIO API.

SeqXML is a lightweight XML format which is supposed to be an alternative to
FASTA files. For more information see http://www.seqXML.org and Schmitt et al
(2011), https://doi.org/10.1093/bib/bbr025
"""
from xml import sax
from xml.sax import handler
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from .Interfaces import SequenceIterator
from .Interfaces import SequenceWriter


class ContentHandler(handler.ContentHandler):
    """Handles XML events generated by the parser (PRIVATE)."""

    def __init__(self):
        """Create a handler to handle XML events."""
        super().__init__()
        self.source = None
        self.sourceVersion = None
        self.seqXMLversion = None
        self.ncbiTaxID = None
        self.speciesName = None
        self.startElementNS = None
        self.data = None
        self.records = []

    def startDocument(self):
        """Set XML handlers when an XML declaration is found."""
        self.startElementNS = self.startSeqXMLElement

    def startSeqXMLElement(self, name, qname, attrs):
        """Handle start of a seqXML element."""
        if name != (None, "seqXML"):
            raise ValueError("Failed to find the start of seqXML element")
        if qname is not None:
            raise RuntimeError("Unexpected qname for seqXML element")
        schema = None
        for key, value in attrs.items():
            namespace, localname = key
            if namespace is None:
                if localname == "source":
                    self.source = value
                elif localname == "sourceVersion":
                    self.sourceVersion = value
                elif localname == "seqXMLversion":
                    self.seqXMLversion = value
                elif localname == "ncbiTaxID":
                    # check if it is an integer, but store as string
                    number = int(value)
                    self.ncbiTaxID = value
                elif localname == "speciesName":
                    self.speciesName = value
                else:
                    raise ValueError("Unexpected attribute for XML Schema")
            elif namespace == "http://www.w3.org/2001/XMLSchema-instance":
                if localname == "noNamespaceSchemaLocation":
                    schema = value
                else:
                    raise ValueError("Unexpected attribute for XML Schema in namespace")
            else:
                raise ValueError(
                    f"Unexpected namespace '{namespace}' for seqXML attribute"
                )
        if self.seqXMLversion is None:
            raise ValueError("Failed to find seqXMLversion")
        url = f"http://www.seqxml.org/{self.seqXMLversion}/seqxml.xsd"
        if schema != url:
            raise ValueError(
                "XML Schema '%s' found not consistent with reported seqXML version %s"
                % (schema, self.seqXMLversion)
            )
        self.endElementNS = self.endSeqXMLElement
        self.startElementNS = self.startEntryElement

    def endSeqXMLElement(self, name, qname):
        """Handle end of the seqXML element."""
        namespace, localname = name
        if namespace is not None:
            raise RuntimeError(f"Unexpected namespace '{namespace}' for seqXML end")
        if qname is not None:
            raise RuntimeError(f"Unexpected qname '{qname}' for seqXML end")
        if localname != "seqXML":
            raise RuntimeError("Failed to find end of seqXML element")
        self.startElementNS = None
        self.endElementNS = None

    def startEntryElement(self, name, qname, attrs):
        """Set new entry with id and the optional entry source (PRIVATE)."""
        if name != (None, "entry"):
            raise ValueError("Expected to find the start of an entry element")
        if qname is not None:
            raise RuntimeError("Unexpected qname for entry element")
        record = SeqRecord("", id=None)
        if self.speciesName is not None:
            record.annotations["organism"] = self.speciesName
        if self.ncbiTaxID is not None:
            record.annotations["ncbi_taxid"] = self.ncbiTaxID
        record.annotations["source"] = self.source
        for key, value in attrs.items():
            namespace, localname = key
            if namespace is None:
                if localname == "id":
                    record.id = value
                elif localname == "source":
                    record.annotations["source"] = value
                else:
                    raise ValueError(
                        f"Unexpected attribute {localname} in entry element"
                    )
            else:
                raise ValueError(
                    f"Unexpected namespace '{namespace}' for entry attribute"
                )
        if record.id is None:
            raise ValueError("Failed to find entry ID")
        self.records.append(record)
        self.startElementNS = self.startEntryFieldElement
        self.endElementNS = self.endEntryElement

    def endEntryElement(self, name, qname):
        """Handle end of an entry element."""
        if name != (None, "entry"):
            raise ValueError("Expected to find the end of an entry element")
        if qname is not None:
            raise RuntimeError("Unexpected qname for entry element")
        self.startElementNS = self.startEntryElement
        self.endElementNS = self.endSeqXMLElement

    def startEntryFieldElement(self, name, qname, attrs):
        """Receive a field of an entry element and forward it."""
        namespace, localname = name
        if namespace is not None:
            raise ValueError(
                f"Unexpected namespace '{namespace}' for {localname} element"
            )
        if qname is not None:
            raise RuntimeError(f"Unexpected qname '{qname}' for {localname} element")
        if localname == "species":
            return self.startSpeciesElement(attrs)
        if localname == "description":
            return self.startDescriptionElement(attrs)
        if localname in ("DNAseq", "RNAseq", "AAseq"):
            return self.startSequenceElement(attrs)
        if localname == "DBRef":
            return self.startDBRefElement(attrs)
        if localname == "property":
            return self.startPropertyElement(attrs)
        raise ValueError(f"Unexpected field {localname} in entry")

    def startSpeciesElement(self, attrs):
        """Parse the species information."""
        name = None
        ncbiTaxID = None
        for key, value in attrs.items():
            namespace, localname = key
            if namespace is None:
                if localname == "name":
                    name = value
                elif localname == "ncbiTaxID":
                    # check if it is an integer, but store as string
                    number = int(value)
                    ncbiTaxID = value
                else:
                    raise ValueError(
                        f"Unexpected attribute '{key}' found in species tag"
                    )
            else:
                raise ValueError(
                    f"Unexpected namespace '{namespace}' for species attribute"
                )
        # The attributes "name" and "ncbiTaxID" are required:
        if name is None:
            raise ValueError("Failed to find species name")
        if ncbiTaxID is None:
            raise ValueError("Failed to find ncbiTaxId")
        record = self.records[-1]
        # The keywords for the species annotation are taken from SwissIO
        record.annotations["organism"] = name
        # TODO - Should have been a list to match SwissProt parser:
        record.annotations["ncbi_taxid"] = ncbiTaxID
        self.endElementNS = self.endSpeciesElement

    def endSpeciesElement(self, name, qname):
        """Handle end of a species element."""
        namespace, localname = name
        if namespace is not None:
            raise RuntimeError(f"Unexpected namespace '{namespace}' for species end")
        if qname is not None:
            raise RuntimeError(f"Unexpected qname '{qname}' for species end")
        if localname != "species":
            raise RuntimeError("Failed to find end of species element")
        self.endElementNS = self.endEntryElement

    def startDescriptionElement(self, attrs):
        """Parse the description."""
        if attrs:
            raise ValueError("Unexpected attributes found in description element")
        if self.data is not None:
            raise RuntimeError(f"Unexpected data found: '{self.data}'")
        self.data = ""
        self.endElementNS = self.endDescriptionElement

    def endDescriptionElement(self, name, qname):
        """Handle the end of a description element."""
        namespace, localname = name
        if namespace is not None:
            raise RuntimeError(
                f"Unexpected namespace '{namespace}' for description end"
            )
        if qname is not None:
            raise RuntimeError(f"Unexpected qname '{qname}' for description end")
        if localname != "description":
            raise RuntimeError("Failed to find end of description element")
        record = self.records[-1]
        description = self.data
        if description:  # ignore if empty string
            record.description = description
        self.data = None
        self.endElementNS = self.endEntryElement

    def startSequenceElement(self, attrs):
        """Parse DNA, RNA, or protein sequence."""
        if attrs:
            raise ValueError("Unexpected attributes found in sequence element")
        if self.data is not None:
            raise RuntimeError(f"Unexpected data found: '{self.data}'")
        self.data = ""
        self.endElementNS = self.endSequenceElement

    def endSequenceElement(self, name, qname):
        """Handle the end of a sequence element."""
        namespace, localname = name
        if namespace is not None:
            raise RuntimeError(f"Unexpected namespace '{namespace}' for sequence end")
        if qname is not None:
            raise RuntimeError(f"Unexpected qname '{qname}' for sequence end")
        record = self.records[-1]
        if localname == "DNAseq":
            record.annotations["molecule_type"] = "DNA"
        elif localname == "RNAseq":
            record.annotations["molecule_type"] = "RNA"
        elif localname == "AAseq":
            record.annotations["molecule_type"] = "protein"
        else:
            raise RuntimeError(
                f"Failed to find end of sequence (localname = {localname})"
            )
        record.seq = Seq(self.data)
        self.data = None
        self.endElementNS = self.endEntryElement

    def startDBRefElement(self, attrs):
        """Parse a database cross reference."""
        source = None
        ID = None
        for key, value in attrs.items():
            namespace, localname = key
            if namespace is None:
                if localname == "source":
                    source = value
                elif localname == "id":
                    ID = value
                else:
                    raise ValueError(
                        f"Unexpected attribute '{key}' found for DBRef element"
                    )
            else:
                raise ValueError(
                    f"Unexpected namespace '{namespace}' for DBRef attribute"
                )
        # The attributes "source" and "id" are required:
        if source is None:
            raise ValueError("Failed to find source for DBRef element")
        if ID is None:
            raise ValueError("Failed to find id for DBRef element")
        if self.data is not None:
            raise RuntimeError(f"Unexpected data found: '{self.data}'")
        self.data = ""
        record = self.records[-1]
        dbxref = f"{source}:{ID}"
        if dbxref not in record.dbxrefs:
            record.dbxrefs.append(dbxref)
        self.endElementNS = self.endDBRefElement

    def endDBRefElement(self, name, qname):
        """Handle the end of a DBRef element."""
        namespace, localname = name
        if namespace is not None:
            raise RuntimeError(f"Unexpected namespace '{namespace}' for DBRef element")
        if qname is not None:
            raise RuntimeError(f"Unexpected qname '{qname}' for DBRef element")
        if localname != "DBRef":
            raise RuntimeError(f"Unexpected localname '{localname}' for DBRef element")
        if self.data:
            raise RuntimeError(
                f"Unexpected data received for DBRef element: '{self.data}'"
            )
        self.data = None
        self.endElementNS = self.endEntryElement

    def startPropertyElement(self, attrs):
        """Handle the start of a property element."""
        property_name = None
        property_value = None
        for key, value in attrs.items():
            namespace, localname = key
            if namespace is None:
                if localname == "name":
                    property_name = value
                elif localname == "value":
                    property_value = value
                else:
                    raise ValueError(
                        "Unexpected attribute '%s' found for property element", key
                    )
            else:
                raise ValueError(
                    f"Unexpected namespace '{namespace}' for property attribute"
                )
        # The attribute "name" is required:
        if property_name is None:
            raise ValueError("Failed to find name for property element")
        record = self.records[-1]
        if property_name == "molecule_type":
            # At this point, record.annotations["molecule_type"] is either
            # "DNA", "RNA", or "protein"; property_value may be a more detailed
            # description such as "mRNA" or "genomic DNA".
            assert record.annotations[property_name] in property_value
            record.annotations[property_name] = property_value
        else:
            if property_name not in record.annotations:
                record.annotations[property_name] = []
            record.annotations[property_name].append(property_value)
        self.endElementNS = self.endPropertyElement

    def endPropertyElement(self, name, qname):
        """Handle the end of a property element."""
        namespace, localname = name
        if namespace is not None:
            raise RuntimeError(
                f"Unexpected namespace '{namespace}' for property element"
            )
        if qname is not None:
            raise RuntimeError(f"Unexpected qname '{qname}' for property element")
        if localname != "property":
            raise RuntimeError(
                f"Unexpected localname '{localname}' for property element"
            )
        self.endElementNS = self.endEntryElement

    def characters(self, data):
        """Handle character data."""
        if self.data is not None:
            self.data += data


class SeqXmlIterator(SequenceIterator):
    """Parser for seqXML files.

    Parses seqXML files and creates SeqRecords.
    Assumes valid seqXML; please validate beforehand.

    The file is fed to a namespace-aware SAX parser in blocks of ``BLOCK``
    bytes. A ContentHandler instance collects the records as the parser
    reports elements, and the records are yielded as soon as a later
    record has been started.

    After the seqXML root element has been read, the attributes
    ``seqXMLversion``, ``source``, ``sourceVersion``, ``ncbiTaxID`` and
    ``speciesName`` of the iterator hold the corresponding root attributes
    (None where the file does not give them).
    """

    # Number of bytes read from the handle and fed to the parser at a time.
    BLOCK = 1024

    def __init__(self, stream_or_path, namespace=None):
        """Create the object and initialize the XML parser.

        Arguments:
         - stream_or_path - Input stream opened in binary mode, or a path
           to a file.
         - namespace - Not used by this implementation.

        """
        # Make sure we got a binary handle. If we got a text handle, then
        # the parser will still run but unicode characters will be garbled
        # if the text handle was opened with a different encoding than the
        # one specified in the XML file. With a binary handle, the correct
        # encoding is picked up by the parser from the XML file.
        self.parser = sax.make_parser()
        content_handler = ContentHandler()
        self.parser.setContentHandler(content_handler)
        self.parser.setFeature(handler.feature_namespaces, True)
        super().__init__(stream_or_path, mode="b", fmt="SeqXML")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator.

        Reads blocks until the seqXML root element has been seen, copies
        its attributes onto the iterator, and returns the generator
        created by ``iterate``.

        Raises ValueError if the handle is exhausted before the root
        element is found.
        """
        parser = self.parser
        content_handler = parser.getContentHandler()
        BLOCK = self.BLOCK
        while True:
            # Read in another block of the file...
            text = handle.read(BLOCK)
            if not text:
                # startElementNS is still None if nothing was ever fed to
                # the parser, i.e. the very first read returned no data.
                if content_handler.startElementNS is None:
                    raise ValueError("Empty file.")
                else:
                    raise ValueError("XML file contains no data.")
            parser.feed(text)
            seqXMLversion = content_handler.seqXMLversion
            if seqXMLversion is not None:
                break
        self.seqXMLversion = seqXMLversion
        self.source = content_handler.source
        self.sourceVersion = content_handler.sourceVersion
        self.ncbiTaxID = content_handler.ncbiTaxID
        self.speciesName = content_handler.speciesName
        records = self.iterate(handle)
        return records

    def iterate(self, handle):
        """Iterate over the records in the XML file.

        Yields the records collected by the content handler in document
        order, reading further blocks from ``handle`` as needed.
        """
        parser = self.parser
        content_handler = parser.getContentHandler()
        records = content_handler.records
        BLOCK = self.BLOCK
        while True:
            # Every record except the last one is finished, because a new
            # record is only appended when the next entry starts. Hand all
            # of them out before reading more, so that the list does not
            # keep growing when one block contains several entries.
            while len(records) > 1:
                yield records.pop(0)
            # Read in another block of the file...
            text = handle.read(BLOCK)
            if not text:
                break
            parser.feed(text)
        # We have reached the end of the XML file;
        # send out the remaining records
        yield from records
        records.clear()
        parser.close()


class SeqXmlWriter(SequenceWriter):
    """Writes SeqRecords into seqXML file.

    SeqXML requires the SeqRecord annotations to specify the molecule_type;
    the molecule type is required to contain the term "DNA", "RNA", or
    "protein".
    """

    def __init__(
        self, target, source=None, source_version=None, species=None, ncbiTaxId=None
    ):
        """Open the target and start the XML document.

        Arguments:
         - target - Output stream opened in binary mode, or a path to a file.
         - source - The source program/database of the file, for example
           UniProt.
         - source_version - The version or release number of the source
           program or database from which the data originated.
         - species - The scientific name of the species of origin of all
           entries in the file.
         - ncbiTaxId - The NCBI taxonomy identifier of the species of origin.

        """
        super().__init__(target, "wb")
        # The generator writes UTF-8 encoded XML to the handle.
        self.xml_generator = generator = XMLGenerator(self.handle, "utf-8")
        generator.startDocument()
        # File-level metadata; written by write_header as root attributes.
        self.source, self.source_version = source, source_version
        self.species, self.ncbiTaxId = species, ncbiTaxId

    def write_header(self):
        """Write root node with document metadata.

        The root element always declares seqXML version 0.4 and its schema
        location; source, sourceVersion, speciesName and ncbiTaxID are
        added when the corresponding writer attribute is not None.

        Raises TypeError if the species is not a string, or if the NCBI
        taxonomy identifier is neither a string nor an integer.
        """
        attrs = {
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xsi:noNamespaceSchemaLocation": "http://www.seqxml.org/0.4/seqxml.xsd",
            "seqXMLversion": "0.4",
        }

        if self.source is not None:
            attrs["source"] = self.source
        if self.source_version is not None:
            attrs["sourceVersion"] = self.source_version
        if self.species is not None:
            if not isinstance(self.species, str):
                raise TypeError("species should be of type string")
            attrs["speciesName"] = self.species
        if self.ncbiTaxId is not None:
            if not isinstance(self.ncbiTaxId, (str, int)):
                raise TypeError("ncbiTaxID should be of type string or int")
            # An integer is accepted above, but the XML generator can only
            # quote string attribute values, so convert it here.
            attrs["ncbiTaxID"] = str(self.ncbiTaxId)

        self.xml_generator.startElement("seqXML", AttributesImpl(attrs))

    def write_record(self, record):
        """Write one record as an entry element.

        The entry carries the record id and, when the record's ``source``
        annotation differs from the writer's source, a source attribute.
        Species, description, sequence, database cross references and
        properties are written inside the entry, in that order.

        Raises ValueError if the record has no usable identifier, and
        TypeError if the identifier or the source is not a string.
        """
        identifier = record.id
        if not identifier or identifier == "<unknown id>":
            raise ValueError("SeqXML requires identifier")
        if not isinstance(identifier, str):
            raise TypeError("Identifier should be of type string")

        entry_attributes = {"id": identifier}

        annotations = record.annotations
        if "source" in annotations:
            entry_source = annotations["source"]
            # Only repeat the source on the entry if it is not the file's.
            if self.source != entry_source:
                if not isinstance(entry_source, str):
                    raise TypeError("source should be of type string")
                entry_attributes["source"] = entry_source

        generator = self.xml_generator
        generator.startElement("entry", AttributesImpl(entry_attributes))
        for write_part in (
            self._write_species,
            self._write_description,
            self._write_seq,
            self._write_dbxrefs,
            self._write_properties,
        ):
            write_part(record)
        generator.endElement("entry")

    def write_footer(self):
        """Write the end tag of the root element and end the XML document."""
        generator = self.xml_generator
        generator.endElement("seqXML")
        generator.endDocument()

    def _write_species(self, record):
        """Write the species if given (PRIVATE).

        A species element is written when the record has both an
        ``organism`` annotation and a non-empty ``ncbi_taxid`` annotation,
        and they differ from the file-level species given to the writer.
        The ``ncbi_taxid`` annotation may be a string, an integer, or a
        list holding at most one of those.

        Raises ValueError if the ``ncbi_taxid`` list has several entries,
        and TypeError if the organism is not a string or the taxonomy
        identifier is neither a string nor an integer.
        """
        annotations = record.annotations
        local_ncbi_taxid = annotations.get("ncbi_taxid")
        if isinstance(local_ncbi_taxid, list):
            # SwissProt parser uses a list (which could cope with chimeras)
            if len(local_ncbi_taxid) > 1:
                raise ValueError(
                    "Multiple entries for record.annotations['ncbi_taxid'], %r"
                    % local_ncbi_taxid
                )
            local_ncbi_taxid = local_ncbi_taxid[0] if local_ncbi_taxid else None
        if "organism" not in annotations or not local_ncbi_taxid:
            return
        local_org = annotations["organism"]

        if not isinstance(local_org, str):
            raise TypeError("organism should be of type string")

        if not isinstance(local_ncbi_taxid, (str, int)):
            raise TypeError("ncbiTaxID should be of type string or int")

        # Compare the identifiers as strings: either side may be given as
        # a string or an integer, and "9606" and 9606 are the same taxon.
        local_taxid = str(local_ncbi_taxid)
        file_taxid = None if self.ncbiTaxId is None else str(self.ncbiTaxId)

        # The local species definition is only written if it differs from
        # the global species definition
        if local_org != self.species or local_taxid != file_taxid:
            attr = {"name": local_org, "ncbiTaxID": local_taxid}
            self.xml_generator.startElement("species", AttributesImpl(attr))
            self.xml_generator.endElement("species")

    def _write_description(self, record):
        """Write the description if given (PRIVATE).

        Nothing is written for an empty description or for the placeholder
        "<unknown description>".

        Raises TypeError if the description is not a string.
        """
        description = record.description
        if not description:
            return

        if not isinstance(description, str):
            raise TypeError("Description should be of type string")

        # The placeholder carries no information; skip the element instead
        # of writing an empty one.
        if description == "<unknown description>":
            return

        self.xml_generator.startElement("description", AttributesImpl({}))
        self.xml_generator.characters(description)
        self.xml_generator.endElement("description")

    def _write_seq(self, record):
        """Write the sequence (PRIVATE).

        The element name is chosen from the record's ``molecule_type``
        annotation, which must contain the term "DNA", "RNA", or
        "protein" (checked in that order).

        Raises ValueError if the sequence is empty, or if the molecule
        type is missing or contains none of the three terms.
        """
        sequence_data = bytes(record.seq)
        if len(sequence_data) == 0:
            raise ValueError("The sequence length should be greater than 0")

        molecule_type = record.annotations.get("molecule_type")
        if molecule_type is None:
            raise ValueError("molecule_type is not defined")

        element_names = (("DNA", "DNAseq"), ("RNA", "RNAseq"), ("protein", "AAseq"))
        for keyword, element_name in element_names:
            if keyword in molecule_type:
                break
        else:
            raise ValueError(f"unknown molecule_type '{molecule_type}'")

        generator = self.xml_generator
        generator.startElement(element_name, AttributesImpl({}))
        generator.characters(sequence_data)
        generator.endElement(element_name)

    def _write_dbxrefs(self, record):
        """Write all database cross references (PRIVATE).

        Each reference must be a string of the form "source:id"; it is
        split at the first colon into the attributes of a DBRef element.

        Raises TypeError for a non-string reference, and ValueError for a
        reference without a colon or with nothing in front of it.
        """
        references = record.dbxrefs
        if references is None:
            return
        generator = self.xml_generator
        for reference in references:
            if not isinstance(reference, str):
                raise TypeError("dbxrefs should be of type list of string")
            if reference.find(":") < 1:
                raise ValueError(
                    "dbxrefs should be in the form ['source:id', 'source:id' ]"
                )
            database, _, accession = reference.partition(":")
            attributes = {"source": database, "id": accession}
            generator.startElement("DBRef", AttributesImpl(attributes))
            generator.endElement("DBRef")

    def _write_properties(self, record):
        """Write annotations as property elements (PRIVATE).

        The annotations "organism", "ncbi_taxid" and "source" are skipped.
        A value of None gives a property with only a name; an int, float
        or str value gives one property; a list gives one property per
        item. Values of any other type are not written.
        """

        def emit(attributes):
            """Write one empty property element with the given attributes."""
            self.xml_generator.startElement("property", AttributesImpl(attributes))
            self.xml_generator.endElement("property")

        skipped_keys = ("organism", "ncbi_taxid", "source")
        for key, value in record.annotations.items():
            if key in skipped_keys:
                continue
            if value is None:
                emit({"name": key})
            elif isinstance(value, list):
                for item in value:
                    if item is None:
                        emit({"name": key})
                    else:
                        emit({"name": key, "value": str(item)})
            elif isinstance(value, (int, float, str)):
                emit({"name": key, "value": str(value)})