Spaces:
No application file
No application file
# Copyright 2006-2016 by Peter Cock. All rights reserved. | |
# Revisions copyright 2011 Brandon Invergo. All rights reserved. | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""AlignIO support for "phylip" format from Joe Felsenstein's PHYLIP tools. | |
You are expected to use this module via the Bio.AlignIO functions (or the | |
Bio.SeqIO functions if you want to work directly with the gapped sequences). | |
Support for "relaxed phylip" format is also provided. Relaxed phylip differs | |
from standard phylip format in the following ways: | |
- No whitespace is allowed in the sequence ID. | |
- No truncation is performed. Instead, sequence IDs are padded to the longest | |
ID length, rather than 10 characters. A space separates the sequence | |
identifier from the sequence. | |
Relaxed phylip is supported by RAxML and PHYML. | |
Note | |
==== | |
In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) | |
a dot/period (".") in a sequence is interpreted as meaning the same | |
character as in the first sequence. The PHYLIP documentation from 3.3 to 3.69 | |
http://evolution.genetics.washington.edu/phylip/doc/sequence.html says: | |
"a period was also previously allowed but it is no longer allowed, | |
because it sometimes is used in different senses in other programs" | |
Biopython 1.58 or later treats dots/periods in the sequence as invalid, both | |
for reading and writing. Older versions did nothing special with a dot/period. | |
""" | |
import string | |
from Bio.Align import MultipleSeqAlignment | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from .Interfaces import AlignmentIterator | |
from .Interfaces import SequentialAlignmentWriter | |
# Default width of the identifier column: used as the default "id_width" of
# the writers and as the "id_width" class attribute of PhylipIterator.
_PHYLIP_ID_WIDTH = 10
# Message of the ValueError raised whenever a "." is found in sequence data,
# both when reading and when writing.
_NO_DOTS = "PHYLIP format no longer allows dots in sequence"
class PhylipWriter(SequentialAlignmentWriter):
    """Phylip alignment writer."""

    def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
        """Use this to write (another) single alignment to an open file.

        This code will write interlaced alignments (when the sequences are
        longer than 50 characters).

        Note that record identifiers are strictly truncated to id_width,
        defaulting to the value required to comply with the PHYLIP standard.

        Raises ValueError if the alignment has no records, if the records
        differ in length, if the sequences are empty, if two identifiers are
        equal after sanitising/truncation, or if a sequence contains a ".".
        All of these checks are done before anything is written to the handle.

        For more information on the file format, please see:
        http://evolution.genetics.washington.edu/phylip/doc/sequence.html
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
        """
        handle = self.handle

        if len(alignment) == 0:
            raise ValueError("Must have at least one sequence")
        length_of_seqs = alignment.get_alignment_length()
        for record in alignment:
            if length_of_seqs != len(record.seq):
                raise ValueError("Sequences must all be the same length")
        if length_of_seqs <= 0:
            raise ValueError("Non-empty sequences are required")

        # Check for repeated identifiers...
        # Apply this test *after* cleaning the identifiers
        #
        # Quoting the PHYLIP version 3.6 documentation:
        #
        #     The name should be ten characters in length, filled out to
        #     the full ten characters by blanks if shorter. Any printable
        #     ASCII/ISO character is allowed in the name, except for
        #     parentheses ("(" and ")"), square brackets ("[" and "]"),
        #     colon (":"), semicolon (";") and comma (","). If you forget
        #     to extend the names to ten characters in length by blanks,
        #     the program [i.e. PHYLIP] will get out of synchronization
        #     with the contents of the data file, and an error message will
        #     result.
        #
        #     Note that Tab characters count as only one character in the
        #     species names. Their inclusion can cause trouble.
        names = []
        seen_names = set()  # constant-time duplicate detection
        seqs = []
        for record in alignment:
            name = sanitize_name(record.id, id_width)
            if name in seen_names:
                raise ValueError(
                    "Repeated name %r (originally %r), possibly due to truncation"
                    % (name, record.id)
                )
            seen_names.add(name)
            names.append(name)
            sequence = str(record.seq)
            if "." in sequence:
                # Do this check here (once per record, not once per block)
                raise ValueError(_NO_DOTS)
            seqs.append(sequence)

        # From experimentation, the use of tabs is not understood by the
        # EMBOSS suite. The nature of the expected white space is not
        # defined in the PHYLIP documentation, simply "These are in free
        # format, separated by blanks". We'll use spaces to keep EMBOSS
        # happy.
        handle.write(" %i %s\n" % (len(alignment), length_of_seqs))

        # Each block holds up to 50 columns; blocks are separated by a single
        # blank line (none after the final block).
        for block, block_start in enumerate(range(0, length_of_seqs, 50)):
            if block:
                handle.write("\n")
            for name, sequence in zip(names, seqs):
                if block == 0:
                    # Write name (truncated/padded to id_width characters)
                    handle.write(name[:id_width].ljust(id_width))
                else:
                    # Later blocks carry no names, only an indent
                    handle.write(" " * id_width)
                # Write five chunks of ten letters per line...
                for chunk in range(5):
                    i = block_start + chunk * 10
                    # TODO - Force any gaps to be '-' character?
                    # TODO - How to cope with '?' in the sequence?
                    handle.write(f" {sequence[i : i + 10]}")
                    # NOTE: strictly-greater test, so when the sequence ends
                    # exactly on a chunk boundary (other than the fifth) one
                    # extra empty chunk, i.e. a trailing space, is written.
                    if i + 10 > length_of_seqs:
                        break
                handle.write("\n")
class PhylipIterator(AlignmentIterator):
    """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator.

    Record identifiers are limited to at most 10 characters.

    It only copes with interlaced phylip files! Sequential files won't work
    where the sequences are split over multiple lines.

    For more information on the file format, please see:
    http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
    """

    # Default truncation length
    id_width = _PHYLIP_ID_WIDTH

    _header = None  # for caching lines between __next__ calls

    def _is_header(self, line):
        """Return True if the line consists of exactly two integers (PRIVATE)."""
        # str.split() without arguments ignores leading/trailing whitespace
        # and never yields empty fields.
        parts = line.split()
        if len(parts) != 2:
            return False  # First line should have two integers
        try:
            int(parts[0])
            int(parts[1])
        except ValueError:
            return False  # First line should have two integers
        return True

    def _split_id(self, line):
        """Extract the sequence ID from a Phylip line (PRIVATE).

        Returning a tuple containing: (sequence_id, sequence_residues)

        The first self.id_width characters in the line are the sequence id,
        the remainder are sequence data (with spaces removed).
        """
        seq_id = line[: self.id_width].strip()
        seq = line[self.id_width :].strip().replace(" ", "")
        return seq_id, seq

    def __next__(self):
        """Parse the next alignment from the handle.

        Raises StopIteration at the end of the handle, and ValueError if the
        header is malformed, the number of records does not match
        self.records_per_alignment, the data contains a ".", or the file
        ends before a block is complete.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration
        parts = line.split()
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            # The declared sequence length is only validated here; it is not
            # otherwise used when parsing interlaced files.
            int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers") from None

        if (
            self.records_per_alignment is not None
            and self.records_per_alignment != number_of_seqs
        ):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i"
                % (number_of_seqs, self.records_per_alignment)
            )

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for _ in range(number_of_seqs):
            line = handle.readline()
            if not line:
                # readline() returns "" only at end of file; a blank line
                # would still carry its newline.
                raise ValueError("End of file mid-block")
            sequence_id, s = self._split_id(line.rstrip())
            ids.append(sequence_id)
            if "." in s:
                raise ValueError(_NO_DOTS)
            seqs.append([s])

        # Look for further blocks
        line = ""
        while True:
            # Skip any blank lines between blocks...
            while "" == line.strip():
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if self._is_header(line):
                # Looks like the start of a concatenated alignment
                self._header = line
                break

            if number_of_seqs <= 0:
                # Without this guard the block loop below would never consume
                # the pending line and this while loop would spin forever.
                raise ValueError(
                    "Found sequence data but the header declared no sequences"
                )

            for i in range(number_of_seqs):
                s = line.strip().replace(" ", "")
                if "." in s:
                    raise ValueError(_NO_DOTS)
                seqs[i].append(s)
                line = handle.readline()
                if (not line) and i + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line:
                break  # end of file

        records = (
            SeqRecord(Seq("".join(s)), id=i, name=i, description=i)
            for (i, s) in zip(ids, seqs)
        )
        return MultipleSeqAlignment(records)
# Relaxed Phylip | |
class RelaxedPhylipWriter(PhylipWriter):
    """Relaxed Phylip format writer."""

    def write_alignment(self, alignment):
        """Write a relaxed phylip alignment.

        Identifiers (after stripping surrounding whitespace) must not contain
        any whitespace, otherwise a ValueError is raised. The identifier
        column is made one character wider than the longest identifier.
        """
        stripped_ids = [record.id.strip() for record in alignment]

        # Check inputs
        for name in stripped_ids:
            for blank in string.whitespace:
                if blank in name:
                    raise ValueError(f"Whitespace not allowed in identifier: {name}")

        # Truncation length is the longest identifier plus one padding
        # character. With no records this evaluates to 1, and the parent
        # class call below raises the ValueError for the empty alignment.
        longest = max(map(len, stripped_ids), default=0)
        super().write_alignment(alignment, longest + 1)
class RelaxedPhylipIterator(PhylipIterator):
    """Relaxed Phylip format Iterator."""

    def _split_id(self, line):
        """Extract the sequence ID from a Phylip line (PRIVATE).

        Returns a tuple containing: (sequence_id, sequence_residues)

        For relaxed format split at the first whitespace character.

        Raises ValueError if the line does not contain both an identifier
        and sequence data separated by whitespace.
        """
        fields = line.split(None, 1)
        if len(fields) != 2:
            # Same exception type the tuple unpacking used to raise, but with
            # a message that shows the offending line.
            raise ValueError(
                f"Expected an identifier and sequence separated by whitespace: {line!r}"
            )
        seq_id, sequence = fields
        sequence = sequence.strip().replace(" ", "")
        return seq_id, sequence
class SequentialPhylipWriter(SequentialAlignmentWriter):
    """Sequential Phylip format Writer."""

    def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
        """Write a Phylip alignment to the handle.

        Each record is written on a single line: the identifier truncated
        and padded to id_width characters, immediately followed by the whole
        sequence.

        Raises ValueError if the alignment has no records, if the records
        differ in length, if the sequences are empty, if two identifiers are
        equal after sanitising/truncation, or if a sequence contains a ".".
        All of these checks are done before anything is written to the
        handle, so a rejected alignment leaves no partial output.
        """
        handle = self.handle

        if len(alignment) == 0:
            raise ValueError("Must have at least one sequence")
        length_of_seqs = alignment.get_alignment_length()
        for record in alignment:
            if length_of_seqs != len(record.seq):
                raise ValueError("Sequences must all be the same length")
        if length_of_seqs <= 0:
            raise ValueError("Non-empty sequences are required")

        # Check for repeated identifiers...
        # Apply this test *after* cleaning the identifiers
        names = []
        seen_names = set()  # constant-time duplicate detection
        for record in alignment:
            # Either remove the banned characters, or map them to something
            # else like an underscore "_" or pipe "|" character...
            name = sanitize_name(record.id, id_width)
            if name in seen_names:
                raise ValueError(
                    "Repeated name %r (originally %r), possibly due to truncation"
                    % (name, record.id)
                )
            seen_names.add(name)
            names.append(name)

        # Validate every sequence up front, after the identifier checks so
        # that a repeated name is still reported in preference to a dot.
        seqs = [str(record.seq) for record in alignment]
        for sequence in seqs:
            if "." in sequence:
                raise ValueError(_NO_DOTS)

        # From experimentation, the use of tabs is not understood by the
        # EMBOSS suite. The nature of the expected white space is not
        # defined in the PHYLIP documentation, simply "These are in free
        # format, separated by blanks". We'll use spaces to keep EMBOSS
        # happy.
        handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
        for name, sequence in zip(names, seqs):
            handle.write(name[:id_width].ljust(id_width))
            # Write the entire sequence to one line (see sequential format
            # notes in the SequentialPhylipIterator docstring)
            handle.write(sequence)
            handle.write("\n")
class SequentialPhylipIterator(PhylipIterator):
    """Sequential Phylip format Iterator.

    The sequential format carries the same restrictions as the normal
    interleaved one, with the difference being that the sequences are listed
    sequentially, each sequence written in its entirety before the start of
    the next. According to the PHYLIP documentation for input file
    formatting, newlines and spaces may optionally be entered at any point
    in the sequences.
    """

    _header = None  # for caching lines between __next__ calls

    def __next__(self):
        """Parse the next alignment from the handle.

        Raises StopIteration at the end of the handle, and ValueError if the
        header is malformed, the number of records does not match
        self.records_per_alignment, a record is longer than the header
        declares, the data contains a ".", or the file ends where a record
        should start.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration
        parts = line.split()
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            length_of_seqs = int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers") from None

        if (
            self.records_per_alignment is not None
            and self.records_per_alignment != number_of_seqs
        ):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i"
                % (number_of_seqs, self.records_per_alignment)
            )

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for _ in range(number_of_seqs):
            line = handle.readline()
            if not line:
                # readline() returns "" only at end of file; a blank line
                # would still carry its newline.
                raise ValueError(
                    "End of file before all %i records were read" % number_of_seqs
                )
            sequence_id, s = self._split_id(line.rstrip())
            ids.append(sequence_id)
            while len(s) < length_of_seqs:
                # The sequence may be split into multiple lines
                line = handle.readline()
                if not line:
                    # End of file: stop here and keep what was read so far.
                    break
                line = line.strip()
                if not line:
                    # Blank lines may appear inside a sequence; skip them.
                    continue
                s += line.replace(" ", "")
            if len(s) > length_of_seqs:
                raise ValueError(
                    "Found a record of length %i, "
                    "should be %i" % (len(s), length_of_seqs)
                )
            if "." in s:
                raise ValueError(_NO_DOTS)
            seqs.append(s)

        while True:
            # Find other alignments in the file
            line = handle.readline()
            if not line:
                break
            if self._is_header(line):
                self._header = line
                break

        records = (
            SeqRecord(Seq(s), id=i, name=i, description=i) for (i, s) in zip(ids, seqs)
        )
        return MultipleSeqAlignment(records)
def sanitize_name(name, width=None):
    """Sanitise sequence identifier for output.

    Surrounding whitespace is stripped first. The banned characters
    "[]()," are then deleted and each ":" or ";" is replaced with "|".
    Finally, if "width" is not None, the name is cut to at most "width"
    characters.
    """
    # One translation table: map ":" and ";" to "|", delete "[](),".
    table = str.maketrans(":;", "||", "[](),")
    cleaned = name.strip().translate(table)
    if width is None:
        return cleaned
    return cleaned[:width]