aakash0017's picture
Upload folder using huggingface_hub
b7731cd
# Copyright 2006-2016 by Peter Cock. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.Align support for the alignment format for input files for PHYLIP tools.
You are expected to use this module via the Bio.Align functions.
"""
from Bio.Align import Alignment
from Bio.Align import interfaces
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# Width, in characters, of the name field at the start of each PHYLIP record.
# The writer truncates and pads names to this width, and the parser slices
# each record line at this column to separate the name from sequence data.
_PHYLIP_ID_WIDTH = 10
class AlignmentWriter(interfaces.AlignmentWriter):
    """Phylip alignment writer.

    Each alignment is written as a header line holding the number of
    sequences and the alignment length, followed by one line per sequence
    made of the name padded to ``_PHYLIP_ID_WIDTH`` characters and the
    complete aligned sequence.
    """

    fmt = "PHYLIP"

    def format_alignment(self, alignment):
        """Return a string with a single alignment in the Phylip format.

        Names are taken from the ``id`` attribute of each entry in
        ``alignment.sequences``; an entry without an ``id`` attribute gets an
        empty name. Each name is stripped of surrounding whitespace, the
        characters ``[](),`` are removed, ``:`` and ``;`` are replaced by
        ``|``, and the result is truncated to ``_PHYLIP_ID_WIDTH`` characters.
        Names are not checked for uniqueness after truncation.

        Raises ValueError if the alignment has no sequences or no columns.
        """
        # One pass per name: map ":" and ";" to "|" and delete "[](),".
        # The two character sets are disjoint and "|" is in neither, so this
        # is equivalent to applying the replacements one after another.
        name_table = str.maketrans(":;", "||", "[](),")

        names = []
        for record in alignment.sequences:
            try:
                name = record.id
            except AttributeError:
                name = ""
            else:
                name = name.strip().translate(name_table)[:_PHYLIP_ID_WIDTH]
            names.append(name)

        nseqs, length = alignment.shape
        if nseqs == 0:
            raise ValueError("Must have at least one sequence")
        if length == 0:
            raise ValueError("Non-empty sequences are required")
        lines = ["%d %d\n" % (nseqs, length)]

        # From experimentation, the use of tabs is not understood by the
        # EMBOSS suite. The nature of the expected white space is not
        # defined in the PHYLIP documentation, simply "These are in free
        # format, separated by blanks". We'll use spaces to keep EMBOSS
        # happy.
        for name, sequence in zip(names, alignment):
            # Write the entire sequence to one line. The name was already
            # truncated above, so only padding is needed here.
            lines.append(name.ljust(_PHYLIP_ID_WIDTH) + sequence + "\n")
        return "".join(lines)
class AlignmentIterator(interfaces.AlignmentIterator):
    """Reads a Phylip alignment file and returns an Alignment iterator.

    Record names are limited to at most 10 characters.

    The parser determines from the file contents if the file format is
    sequential or interleaved, and parses the file accordingly.

    For more information on the file format, please see:
    http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
    """

    fmt = "PHYLIP"

    def _read_header(self, stream):
        """Read the first line and store the sequence count and length.

        Sets ``self._number_of_seqs`` and ``self._length_of_seqs`` from the
        two whitespace-separated integers on the first line of ``stream``.

        Raises ValueError if the stream is empty or the first line does not
        consist of exactly two integers.
        """
        try:
            line = next(stream)
        except StopIteration:
            raise ValueError("Empty file.") from None

        words = line.split()
        if len(words) == 2:
            try:
                # Convert both values before storing either, so a failure on
                # the second does not leave a half-initialised header behind.
                number_of_seqs = int(words[0])
                length_of_seqs = int(words[1])
            except ValueError:
                pass
            else:
                self._number_of_seqs = number_of_seqs
                self._length_of_seqs = length_of_seqs
                return
        raise ValueError(
            "Expected two integers in the first line, received '%s'" % line
        )

    def _parse_interleaved_first_block(self, lines, seqs, names):
        """Parse the first interleaved block, which carries the names.

        For every line, the first ``_PHYLIP_ID_WIDTH`` characters give the
        name and the remainder (with spaces removed) the first chunk of
        sequence data. ``names`` and ``seqs`` are extended in place; each
        entry of ``seqs`` is a list of chunks to be joined later.
        """
        for line in lines:
            line = line.rstrip()
            name = line[:_PHYLIP_ID_WIDTH].strip()
            seq = line[_PHYLIP_ID_WIDTH:].strip().replace(" ", "")
            names.append(name)
            seqs.append([seq])

    def _parse_interleaved_other_blocks(self, stream, seqs):
        """Append the data of all interleaved blocks after the first.

        Blocks are separated by blank lines; within a block the i-th
        non-blank line is appended to ``seqs[i]`` after removing spaces.

        Raises ValueError if a block holds fewer or more lines than there
        are sequences.
        """
        i = 0  # index of the sequence the next data line belongs to
        for line in stream:
            line = line.rstrip()
            if not line:
                if i == 0:
                    # Repeated blank line between blocks; no block to close.
                    continue
                # Malformed input must be reported with ValueError rather
                # than an assert, which disappears under "python -O".
                if i != self._number_of_seqs:
                    raise ValueError("Unexpected file format")
                i = 0
            else:
                if i >= len(seqs):
                    # More data lines in this block than sequences.
                    raise ValueError("Unexpected file format")
                seq = line.replace(" ", "")
                seqs[i].append(seq)
                i += 1
        if i != 0 and i != self._number_of_seqs:
            raise ValueError("Unexpected file format")

    def _parse_sequential(self, lines, seqs, names, length):
        """Parse lines of a sequential-format file.

        ``length`` is the number of sequence characters already collected
        for the record in progress; 0 means the next line starts a new
        record, whose first ``_PHYLIP_ID_WIDTH`` characters are the name.
        ``names`` and ``seqs`` are extended in place.

        Returns the number of characters collected so far for the record
        still in progress (0 if the last record was completed), so that
        parsing can be resumed with a further batch of lines.
        """
        for line in lines:
            if length == 0:
                # Start of a new record: split off the name field.
                line = line.rstrip()
                name = line[:_PHYLIP_ID_WIDTH].strip()
                seq = line[_PHYLIP_ID_WIDTH:].strip()
                names.append(name)
                seqs.append([])
            else:
                # Continuation line: sequence data only.
                seq = line.strip()
            seq = seq.replace(" ", "")
            seqs[-1].append(seq)
            length += len(seq)
            if length == self._length_of_seqs:
                # Record complete; the next line begins a new one.
                length = 0
        return length

    def _read_file(self, stream):
        """Read all records, detecting sequential vs interleaved layout.

        The first ``self._number_of_seqs`` lines are read, then one more:
        if that extra line is non-blank the file is parsed as sequential,
        otherwise (blank line or end of input) as interleaved.

        Returns a tuple ``(names, seqs)`` where ``seqs`` holds, for each
        record, the list of sequence chunks found.
        """
        names = []
        seqs = []
        # zip() stops cleanly if the stream runs out early. Calling
        # next(stream) inside a comprehension would instead let a raw
        # StopIteration escape, which iterator machinery can mistake for a
        # normal end of iteration. A short read is reported later by the
        # record-count check in _read_next_alignment.
        lines = [line for _, line in zip(range(self._number_of_seqs), stream)]
        try:
            line = next(stream)
        except StopIteration:
            pass
        else:
            if line.rstrip():
                # sequential file format
                lines.append(line)
                length = self._parse_sequential(lines, seqs, names, 0)
                self._parse_sequential(stream, seqs, names, length)
                return names, seqs
        # interleaved file format
        self._parse_interleaved_first_block(lines, seqs, names)
        self._parse_interleaved_other_blocks(stream, seqs)
        return names, seqs

    def _read_next_alignment(self, stream):
        """Parse the stream and return the alignment it contains.

        Raises ValueError if the number of records or the length of any
        sequence disagrees with the header, or if a sequence contains a dot.
        """
        names, seqs = self._read_file(stream)

        seqs = ["".join(seq) for seq in seqs]
        if len(seqs) != self._number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i"
                % (len(seqs), self._number_of_seqs)
            )
        for seq in seqs:
            if len(seq) != self._length_of_seqs:
                raise ValueError(
                    "Expected all sequences to have length %d; found %d"
                    % (self._length_of_seqs, len(seq))
                )
            if "." in seq:
                raise ValueError("PHYLIP format no longer allows dots in sequence")

        # Coordinates are inferred from the gapped rows; the records
        # themselves store the sequences with the gap characters removed.
        coordinates = Alignment.infer_coordinates(seqs)
        seqs = [seq.replace("-", "") for seq in seqs]
        records = [
            SeqRecord(Seq(seq), id=name, description="")
            for (name, seq) in zip(names, seqs)
        ]
        alignment = Alignment(records, coordinates)
        # The header values are discarded once the alignment has been built.
        del self._number_of_seqs
        del self._length_of_seqs
        self._close()
        return alignment