# Spaces: No application file
# Copyright 2008-2010, 2012-2014, 2016-2017 by Peter Cock. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.AlignIO support for the "nexus" file format.

You are expected to use this module via the Bio.AlignIO functions (or the
Bio.SeqIO functions if you want to work directly with the gapped sequences).

See also the Bio.Nexus module (which this code calls internally),
as this offers more than just accessing the alignment or its
sequences as SeqRecord objects.
"""
from Bio.Align import MultipleSeqAlignment | |
from Bio.AlignIO.Interfaces import AlignmentWriter | |
from Bio.Nexus import Nexus | |
from Bio.SeqRecord import SeqRecord | |
# You can get a couple of example files here:
# http://www.molecularevolution.org/resources/fileformats/


# This is a generator function!
def NexusIterator(handle, seq_count=None):
    """Yield the single alignment held in a Nexus file.

    This uses the Bio.Nexus module to do the hard work.

    You are expected to call this function via Bio.SeqIO or Bio.AlignIO
    (and not use it directly).

    Arguments:
     - handle - passed unchanged to Nexus.Nexus() for parsing.
     - seq_count - optional expected number of sequences. When given (and
       non-zero), a mismatch with the parsed file raises ValueError.

    Yields one MultipleSeqAlignment. Each record uses the (possibly
    de-duplicated) taxon label as its ``id`` and the unaltered label as its
    ``name``. Nothing is yielded if the parsed file has no matrix.

    NOTE - We only expect ONE alignment matrix per Nexus file,
    meaning this iterator will only yield one MultipleSeqAlignment.
    """
    n = Nexus.Nexus(handle)
    if not n.matrix:
        # No alignment found
        return

    # Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
    # The original names and the modified names are kept in these two lists,
    # which are paired up positionally below, so they must be the same length.
    assert len(n.unaltered_taxlabels) == len(n.taxlabels)

    if seq_count and seq_count != len(n.unaltered_taxlabels):
        raise ValueError(
            "Found %i sequences, but seq_count=%i"
            % (len(n.unaltered_taxlabels), seq_count)
        )

    # TODO - Can we extract any annotation too?
    if n.datatype in ("dna", "nucleotide"):
        molecule_type = "DNA"
    elif n.datatype == "rna":
        molecule_type = "RNA"
    elif n.datatype == "protein":
        molecule_type = "protein"
    else:
        molecule_type = None

    records = (
        SeqRecord(
            n.matrix[new_name],
            id=new_name,
            name=old_name,
            description="",
            # Give every record its own dict: handing the same dict object
            # to all records would let an edit to one record's annotations
            # show up on every other record in the alignment.
            annotations=(
                {"molecule_type": molecule_type}
                if molecule_type is not None
                else None
            ),
        )
        for old_name, new_name in zip(n.unaltered_taxlabels, n.taxlabels)
    )
    # All done
    yield MultipleSeqAlignment(records)
class NexusWriter(AlignmentWriter):
    """Writer for alignments in Nexus format.

    A Nexus file is only expected to hold ONE alignment matrix, so this
    writer refuses to write more than one alignment.

    You are expected to call this class via the Bio.AlignIO.write() or
    Bio.SeqIO.write() functions.
    """

    def write_file(self, alignments):
        """Write a complete Nexus file from the given alignments.

        Arguments:
         - alignments - A list or iterator returning MultipleSeqAlignment
           objects. This should hold ONE and only one alignment.

        Returns the number of alignments written: 0 when ``alignments`` is
        empty, otherwise 1. Raises ValueError if a second alignment is found.
        """
        remaining = iter(alignments)  # accept lists as well as iterators
        try:
            only_alignment = next(remaining)
        except StopIteration:
            # Nothing to write!
            return 0

        # Probe for a second alignment; finding one is an error.
        try:
            next(remaining)
        except StopIteration:
            pass
        else:
            raise ValueError("We can only write one Alignment to a Nexus file.")

        self.write_alignment(only_alignment)
        return 1  # we only support writing one alignment!

    def write_alignment(self, alignment, interleave=None):
        """Write one alignment to the output handle.

        An empty Nexus object is created, each sequence is added to it, and
        the Nexus object then writes the data block to ``self.handle``.

        Arguments:
         - alignment - the alignment to write; it must contain at least one
           sequence and have a non-zero alignment length.
         - interleave - True/False to force the layout; the default of None
           means interleave only when there are more than 1000 columns.

        Raises ValueError for an empty alignment, for zero-length sequences,
        for a DNA alignment containing "U", or for an RNA alignment
        containing "T".
        """
        if len(alignment) == 0:
            raise ValueError("Must have at least one sequence")
        width = alignment.get_alignment_length()
        if width == 0:
            raise ValueError("Non-empty sequences are required")

        datatype = self._classify_mol_type_for_nexus(alignment)
        # Start from an empty data block carrying only the datatype; the
        # sequences are added one at a time below.
        template = (
            "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=%s; end;"
        )
        nexus = Nexus.Nexus(template % datatype)

        for record in alignment:
            # Sanity test sequences (should this be even stricter?)
            if datatype == "dna":
                if "U" in record.seq:
                    raise ValueError(f"{record.id} contains U, but DNA alignment")
            elif datatype == "rna":
                if "T" in record.seq:
                    raise ValueError(f"{record.id} contains T, but RNA alignment")
            nexus.add_sequence(record.id, str(record.seq))

        # Note: MrBayes may choke on large alignments if not interleaved
        use_interleave = width > 1000 if interleave is None else interleave
        nexus.write_nexus_data(self.handle, interleave=use_interleave)

    def _classify_mol_type_for_nexus(self, alignment):
        """Return 'protein', 'dna', or 'rna' based on records' molecule type (PRIVATE).

        All the records must have a molecule_type annotation, and they must
        agree.

        Raises ValueError if this is not possible.
        """
        mol_types = {record.annotations.get("molecule_type") for record in alignment}
        # Substring matching lets e.g. "DNA" and "gDNA" (or "RNA" and "mRNA")
        # count as the same Nexus datatype. The order of the checks matters.
        for marker, nexus_datatype in (
            ("DNA", "dna"),
            ("RNA", "rna"),
            ("protein", "protein"),
        ):
            if all(mol_type and marker in mol_type for mol_type in mol_types):
                return nexus_datatype
        raise ValueError("Need the molecule type to be defined")
if __name__ == "__main__":
    # When executed as a script, hand over to the doctest runner in Bio._utils.
    from Bio import _utils

    _utils.run_doctest(verbose=0)