Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

File size: 6,441 Bytes

b7731cd

# Copyright 2006-2016 by Peter Cock.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.Align support for the alignment format for input files for PHYLIP tools.

You are expected to use this module via the Bio.Align functions.
"""
from Bio.Align import Alignment
from Bio.Align import interfaces
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


_PHYLIP_ID_WIDTH = 10


class AlignmentWriter(interfaces.AlignmentWriter):
    """PHYLIP alignment writer.

    Each alignment is written as a header line holding the number of
    sequences and the number of columns, followed by one line per sequence:
    a name field padded to 10 characters and then the complete aligned row.
    """

    fmt = "PHYLIP"

    # Translation table used to sanitise record names in a single pass:
    # the characters "[]()," are removed, and ":" and ";" become "|".
    # NOTE(review): presumably these characters are avoided because they
    # clash with the tree notation used by PHYLIP tools - confirm.
    _NAME_TABLE = str.maketrans(":;", "||", "[](),")

    def format_alignment(self, alignment):
        """Return a string with a single alignment in the Phylip format.

        Arguments:
         - alignment - the alignment to write; its ``sequences`` provide the
           names (taken from the ``id`` attribute, if any) and iterating over
           it provides the aligned rows as strings.

        Names are stripped, sanitised, and truncated to 10 characters.
        Sequences without an ``id`` attribute, or with ``id`` set to None,
        are written with a blank name.

        Raises ValueError if the alignment has no sequences or no columns.
        """
        names = []
        for record in alignment.sequences:
            # Plain sequence objects have no ``id``; a record may also carry
            # ``id=None``.  Both cases fall back to an empty name instead of
            # failing on ``None.strip()``.
            name = getattr(record, "id", None) or ""
            name = name.strip().translate(self._NAME_TABLE)
            names.append(name[:_PHYLIP_ID_WIDTH])

        nseqs, length = alignment.shape
        if nseqs == 0:
            raise ValueError("Must have at least one sequence")
        if length == 0:
            raise ValueError("Non-empty sequences are required")
        lines = ["%d %d\n" % (nseqs, length)]

        # From experimentation, the use of tabs is not understood by the
        # EMBOSS suite.  The nature of the expected white space is not
        # defined in the PHYLIP documentation, simply "These are in free
        # format, separated by blanks".  We'll use spaces to keep EMBOSS
        # happy.
        for name, sequence in zip(names, alignment):
            # Write the entire sequence to one line; the name was already
            # truncated above, so it only needs padding here.
            lines.append(name.ljust(_PHYLIP_ID_WIDTH) + sequence + "\n")
        return "".join(lines)


class AlignmentIterator(interfaces.AlignmentIterator):
    """Reads a Phylip alignment file and returns an Alignment iterator.

    Record names are limited to at most 10 characters.

    The parser determines from the file contents if the file format is
    sequential or interleaved, and parses the file accordingly.

    For more information on the file format, please see:
    http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
    """

    fmt = "PHYLIP"

    def _read_header(self, stream):
        """Read the first line and store the announced alignment dimensions.

        The line must consist of exactly two integers: the number of
        sequences and the number of columns.  They are stored on the
        instance as ``_number_of_seqs`` and ``_length_of_seqs``.

        Raises ValueError if the stream is empty or the line is malformed.
        """
        try:
            line = next(stream)
        except StopIteration:
            raise ValueError("Empty file.") from None

        words = line.split()
        if len(words) == 2:
            try:
                number_of_seqs = int(words[0])
                length_of_seqs = int(words[1])
            except ValueError:
                pass
            else:
                # Only store the values once both fields parsed, so a bad
                # second field cannot leave a half-initialised instance.
                self._number_of_seqs = number_of_seqs
                self._length_of_seqs = length_of_seqs
                return
        raise ValueError(
            "Expected two integers in the first line, received '%s'" % line
        )

    def _parse_interleaved_first_block(self, lines, seqs, names):
        """Parse the first interleaved block, which carries the names.

        For every line, the first 10 characters give the name and the rest
        (with spaces removed) the first chunk of that sequence.  One name is
        appended to ``names`` and one single-chunk list to ``seqs`` per line.
        """
        for line in lines:
            line = line.rstrip()
            name = line[:_PHYLIP_ID_WIDTH].strip()
            seq = line[_PHYLIP_ID_WIDTH:].strip().replace(" ", "")
            names.append(name)
            seqs.append([seq])

    def _parse_interleaved_other_blocks(self, stream, seqs):
        """Parse the remaining interleaved blocks, which carry no names.

        Blocks are separated by blank lines.  Line ``i`` of each block (with
        spaces removed) is appended to ``seqs[i]``.

        Raises ValueError if a block does not contain exactly one line for
        each sequence already present in ``seqs``.
        """
        i = 0  # index of the next sequence within the current block
        for line in stream:
            line = line.rstrip()
            if not line:
                if i == 0:
                    # No block in progress: tolerate repeated blank lines
                    # between blocks and trailing blank lines at the end.
                    continue
                if i != self._number_of_seqs:
                    # Explicit check rather than ``assert``, which would be
                    # stripped when Python runs with -O.
                    raise ValueError("Unexpected file format")
                i = 0
            else:
                if i >= len(seqs):
                    # More lines in this block than there are sequences.
                    raise ValueError("Unexpected file format")
                seqs[i].append(line.replace(" ", ""))
                i += 1
        if i != 0 and i != self._number_of_seqs:
            raise ValueError("Unexpected file format")

    def _parse_sequential(self, lines, seqs, names, length):
        """Parse lines in the sequential format, appending to seqs and names.

        Arguments:
         - lines  - iterable of lines to parse.
         - seqs   - list of per-record chunk lists, extended in place.
         - names  - list of record names, extended in place.
         - length - number of sequence characters already read for the
           record in progress; 0 means the next line starts a new record.

        A line starting a new record holds the name in its first 10
        characters; all other lines are pure sequence data.  Spaces are
        removed from the sequence data.  A record is complete once its
        accumulated length equals ``self._length_of_seqs``.

        Returns the accumulated length of the record still in progress (0 if
        the last record was completed), so parsing can be resumed by a
        subsequent call.
        """
        for line in lines:
            if length == 0:
                line = line.rstrip()
                name = line[:_PHYLIP_ID_WIDTH].strip()
                seq = line[_PHYLIP_ID_WIDTH:].strip()
                names.append(name)
                seqs.append([])
            else:
                seq = line.strip()
            seq = seq.replace(" ", "")
            seqs[-1].append(seq)
            length += len(seq)
            if length == self._length_of_seqs:
                length = 0
        return length

    def _read_file(self, stream):
        """Read all records from the stream, detecting the file layout.

        The first ``self._number_of_seqs`` lines are read up front.  If the
        line following them is non-blank the file is parsed as sequential;
        if it is blank, or the stream is exhausted, as interleaved.

        Returns a tuple ``(names, seqs)`` where ``seqs`` holds, per record,
        the list of sequence chunks in file order.

        Raises ValueError if the stream ends before the announced number of
        lines could be read, or if the interleaved blocks are malformed.
        """
        names = []
        seqs = []
        try:
            lines = [next(stream) for _ in range(self._number_of_seqs)]
        except StopIteration:
            # Do not let StopIteration escape: to an iterating caller it
            # would look like a normal end of data instead of a truncated
            # file.
            raise ValueError(
                "Unexpected end of file; expected at least %d lines of sequence data"
                % self._number_of_seqs
            ) from None
        try:
            line = next(stream)
        except StopIteration:
            pass
        else:
            if line.rstrip():
                # sequential file format
                lines.append(line)
                length = self._parse_sequential(lines, seqs, names, 0)
                self._parse_sequential(stream, seqs, names, length)
                return names, seqs
        # interleaved file format
        self._parse_interleaved_first_block(lines, seqs, names)
        self._parse_interleaved_other_blocks(stream, seqs)
        return names, seqs

    def _read_next_alignment(self, stream):
        """Parse the stream and return its contents as an Alignment.

        The parsed rows are validated against the dimensions stored by
        ``_read_header``.  Gap characters ("-") are used to infer the
        alignment coordinates and are then removed from the sequences
        stored in the records.

        Raises ValueError if the number of records or the length of any row
        differs from the header, or if a row contains a dot.
        """
        names, seqs = self._read_file(stream)

        seqs = ["".join(seq) for seq in seqs]
        if len(seqs) != self._number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i"
                % (len(seqs), self._number_of_seqs)
            )
        for seq in seqs:
            if len(seq) != self._length_of_seqs:
                raise ValueError(
                    "Expected all sequences to have length %d; found %d"
                    % (self._length_of_seqs, len(seq))
                )
            if "." in seq:
                raise ValueError("PHYLIP format no longer allows dots in sequence")

        coordinates = Alignment.infer_coordinates(seqs)
        seqs = [seq.replace("-", "") for seq in seqs]
        records = [
            SeqRecord(Seq(seq), id=name, description="")
            for (name, seq) in zip(names, seqs)
        ]
        alignment = Alignment(records, coordinates)
        # The whole stream has been consumed for this alignment: drop the
        # header values and hand the stream back via the inherited _close()
        # (defined outside this file).
        del self._number_of_seqs
        del self._length_of_seqs
        self._close()
        return alignment