Spaces:
No application file
No application file
# Copyright 2006-2016 by Peter Cock.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.Align support for the alignment format for input files for PHYLIP tools.

You are expected to use this module via the Bio.Align functions.
"""
from Bio.Align import Alignment | |
from Bio.Align import interfaces | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
# Width, in characters, of the fixed-size name field that starts each record
# line: the writer truncates and pads names to this width, and the parser
# slices the name off the front of a line at this column.
_PHYLIP_ID_WIDTH = 10
class AlignmentWriter(interfaces.AlignmentWriter):
    """PHYLIP alignment writer.

    Each alignment is written as a header line holding the number of
    sequences and the alignment length, followed by one line per sequence
    made of the name (padded to a fixed width) and the complete aligned
    sequence.
    """

    fmt = "PHYLIP"

    # Translation table applied to record names: ":" and ";" become "|",
    # while brackets, parentheses and commas are removed.
    _NAME_TABLE = str.maketrans(":;", "||", "[](),")

    def format_alignment(self, alignment):
        """Return a string with a single alignment in the Phylip format.

        Names are taken from the ``id`` attribute of each entry in
        ``alignment.sequences``; entries without an ``id`` get an empty
        name. Names are stripped, cleaned of the characters ``[](),``,
        have ``:`` and ``;`` replaced by ``|``, and are truncated to
        ``_PHYLIP_ID_WIDTH`` characters.

        Raises:
            ValueError: if the alignment has no sequences or zero columns.
        """
        names = []
        for record in alignment.sequences:
            try:
                name = record.id
            except AttributeError:
                # Plain sequences without an ``id`` are written unnamed.
                name = ""
            else:
                name = name.strip().translate(self._NAME_TABLE)
                name = name[:_PHYLIP_ID_WIDTH]
            names.append(name)
        nseqs, length = alignment.shape
        if nseqs == 0:
            raise ValueError("Must have at least one sequence")
        if length == 0:
            raise ValueError("Non-empty sequences are required")
        lines = ["%d %d\n" % (nseqs, length)]
        # From experimentation, the use of tabs is not understood by the
        # EMBOSS suite. The nature of the expected white space is not
        # defined in the PHYLIP documentation, simply "These are in free
        # format, separated by blanks". We'll use spaces to keep EMBOSS
        # happy.
        # Names were already truncated above, so only padding is needed here;
        # the entire sequence is written to one line.
        lines.extend(
            name.ljust(_PHYLIP_ID_WIDTH) + sequence + "\n"
            for name, sequence in zip(names, alignment)
        )
        return "".join(lines)
class AlignmentIterator(interfaces.AlignmentIterator):
    """Reads a Phylip alignment file and returns an Alignment iterator.

    Record names are limited to at most 10 characters.

    The parser determines from the file contents if the file format is
    sequential or interleaved, and parses the file accordingly.

    For more information on the file format, please see:
    http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
    """

    fmt = "PHYLIP"

    def _read_header(self, stream):
        """Read the first line and store the declared alignment dimensions.

        Sets ``self._number_of_seqs`` and ``self._length_of_seqs`` from the
        two integers on the first line of ``stream``.

        Raises:
            ValueError: if the stream is empty or the first line does not
                consist of exactly two integers.
        """
        try:
            line = next(stream)
        except StopIteration:
            raise ValueError("Empty file.") from None
        words = line.split()
        if len(words) == 2:
            try:
                self._number_of_seqs = int(words[0])
                self._length_of_seqs = int(words[1])
                return
            except ValueError:
                # Fall through to the shared error below.
                pass
        raise ValueError(
            "Expected two integers in the first line, received '%s'" % line
        )

    def _parse_interleaved_first_block(self, lines, seqs, names):
        """Parse the first interleaved block, which carries the names.

        For every line, the leading ``_PHYLIP_ID_WIDTH`` characters give the
        name and the remainder (with spaces removed) the first sequence
        chunk. One name is appended to ``names`` and one single-item list
        of chunks to ``seqs`` per line.
        """
        for line in lines:
            line = line.rstrip()
            name = line[:_PHYLIP_ID_WIDTH].strip()
            seq = line[_PHYLIP_ID_WIDTH:].strip().replace(" ", "")
            names.append(name)
            seqs.append([seq])

    def _parse_interleaved_other_blocks(self, stream, seqs):
        """Parse the remaining interleaved blocks, which carry no names.

        Each non-blank line is appended (with spaces removed) to the chunk
        list of the next sequence in ``seqs``; a blank line after a complete
        block starts the next block. Redundant blank lines between blocks
        are skipped.

        Raises:
            ValueError: if a block holds fewer or more rows than the number
                of sequences declared in the header.
        """
        row = 0  # index of the sequence the next non-blank line belongs to
        for line in stream:
            line = line.rstrip()
            if line:
                if row >= self._number_of_seqs:
                    # More rows in this block than sequences declared.
                    raise ValueError("Unexpected file format")
                seqs[row].append(line.replace(" ", ""))
                row += 1
            elif row == self._number_of_seqs:
                # Blank line closing a complete block.
                row = 0
            elif row != 0:
                # Blank line in the middle of a block. This is raised
                # explicitly (not asserted) so the check survives ``-O``.
                raise ValueError("Unexpected file format")
            # row == 0: extra blank line between blocks, nothing to do.
        if row != 0 and row != self._number_of_seqs:
            raise ValueError("Unexpected file format")

    def _parse_sequential(self, lines, seqs, names, length):
        """Parse lines in the sequential layout.

        ``length`` is the number of sequence characters already collected
        for the record in progress; zero means the next line starts a new
        record and begins with its name. Names are appended to ``names`` and
        sequence chunks to the last entry of ``seqs``. Returns the updated
        ``length`` so parsing can continue with further lines.
        """
        for line in lines:
            if length == 0:
                line = line.rstrip()
                name = line[:_PHYLIP_ID_WIDTH].strip()
                seq = line[_PHYLIP_ID_WIDTH:].strip()
                names.append(name)
                seqs.append([])
            else:
                seq = line.strip()
            seq = seq.replace(" ", "")
            seqs[-1].append(seq)
            length += len(seq)
            if length == self._length_of_seqs:
                # Record complete; the next line starts a new one.
                length = 0
        return length

    def _read_file(self, stream):
        """Read all records from ``stream`` after the header.

        The layout is detected from the line following the first
        ``self._number_of_seqs`` lines: if that line is non-blank the file
        is treated as sequential, otherwise (blank line or end of file) as
        interleaved.

        Returns:
            A tuple ``(names, seqs)`` where ``seqs`` holds, per record, the
            list of sequence chunks in file order.

        Raises:
            ValueError: if the stream ends before the declared number of
                lines could be read.
        """
        names = []
        seqs = []
        # ``range`` comes first in ``zip`` so that no extra line is consumed
        # from the stream once enough lines have been read.
        lines = [line for _, line in zip(range(self._number_of_seqs), stream)]
        if len(lines) < self._number_of_seqs:
            # Truncated file: report it instead of letting StopIteration
            # escape from this method.
            raise ValueError(
                "Found %i records in this alignment, told to expect %i"
                % (len(lines), self._number_of_seqs)
            )
        line = next(stream, "")
        if line.rstrip():
            # sequential file format
            lines.append(line)
            length = self._parse_sequential(lines, seqs, names, 0)
            self._parse_sequential(stream, seqs, names, length)
            return names, seqs
        # interleaved file format
        self._parse_interleaved_first_block(lines, seqs, names)
        self._parse_interleaved_other_blocks(stream, seqs)
        return names, seqs

    def _read_next_alignment(self, stream):
        """Parse the stream and return the alignment it contains.

        The gapped rows are validated against the dimensions declared in the
        header, coordinates are obtained from ``Alignment.infer_coordinates``,
        and the records are built from the rows with ``-`` removed. The
        header attributes are deleted and ``self._close()`` is called before
        returning.

        Raises:
            ValueError: if the number of records or the length of any row
                differs from the header, or a row contains a dot.
        """
        names, seqs = self._read_file(stream)
        seqs = ["".join(seq) for seq in seqs]
        if len(seqs) != self._number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i"
                % (len(seqs), self._number_of_seqs)
            )
        for seq in seqs:
            if len(seq) != self._length_of_seqs:
                raise ValueError(
                    "Expected all sequences to have length %d; found %d"
                    % (self._length_of_seqs, len(seq))
                )
            if "." in seq:
                raise ValueError("PHYLIP format no longer allows dots in sequence")
        coordinates = Alignment.infer_coordinates(seqs)
        seqs = [seq.replace("-", "") for seq in seqs]
        records = [
            SeqRecord(Seq(seq), id=name, description="")
            for (name, seq) in zip(names, seqs)
        ]
        alignment = Alignment(records, coordinates)
        del self._number_of_seqs
        del self._length_of_seqs
        # NOTE(review): _close() is defined outside this block; presumably it
        # releases the stream so no further alignment is read - confirm.
        self._close()
        return alignment