Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /Align /phylip.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

6.44 kB

	# Copyright 2006-2016 by Peter Cock. All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Bio.Align support for the alignment format for input files for PHYLIP tools.

	You are expected to use this module via the Bio.Align functions.
	"""
	from Bio.Align import Alignment
	from Bio.Align import interfaces
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord


	# Width, in characters, of the fixed-size name field that starts each
	# sequence line: the writer pads/truncates names to this width, and the
	# parser slices this many leading characters off a line to get the name.
	_PHYLIP_ID_WIDTH = 10


	class AlignmentWriter(interfaces.AlignmentWriter):
	    """PHYLIP alignment writer.

	    Each alignment is written as a header line holding the number of
	    sequences and the alignment length, followed by one line per sequence:
	    a name field of ``_PHYLIP_ID_WIDTH`` characters and then the complete
	    aligned sequence on the same line.
	    """

	    fmt = "PHYLIP"

	    # Single-pass sanitizer for record names: brackets, parentheses and
	    # commas are deleted; colons and semicolons become a plain "|".
	    _name_table = str.maketrans(":;", "||", "[](),")

	    def format_alignment(self, alignment):
	        """Return a string with a single alignment in the Phylip format.

	        Names are taken from the ``id`` attribute of each item in
	        ``alignment.sequences`` (an empty name is used if there is no such
	        attribute), stripped of surrounding whitespace, sanitized, and
	        truncated to ``_PHYLIP_ID_WIDTH`` characters.

	        Raises:
	            ValueError: if ``alignment.shape`` reports zero sequences or a
	                zero alignment length.
	        """
	        names = []
	        for record in alignment.sequences:
	            try:
	                name = record.id
	            except AttributeError:
	                name = ""
	            else:
	                # Sanitize before truncating so removed characters do not
	                # count against the available name width.
	                name = name.strip().translate(self._name_table)
	                name = name[:_PHYLIP_ID_WIDTH]
	            names.append(name)

	        nseqs, length = alignment.shape
	        if nseqs == 0:
	            raise ValueError("Must have at least one sequence")
	        if length == 0:
	            raise ValueError("Non-empty sequences are required")
	        lines = ["%d %d\n" % (nseqs, length)]

	        # From experimentation, the use of tabs is not understood by the
	        # EMBOSS suite. The nature of the expected white space is not
	        # defined in the PHYLIP documentation, simply "These are in free
	        # format, separated by blanks". We'll use spaces to keep EMBOSS
	        # happy.
	        for name, sequence in zip(names, alignment):
	            # Write the entire sequence to one line; names were already
	            # truncated above, so only padding is needed here.
	            lines.append(name.ljust(_PHYLIP_ID_WIDTH) + sequence + "\n")
	        return "".join(lines)


	class AlignmentIterator(interfaces.AlignmentIterator):
	    """Reads a Phylip alignment file and returns an Alignment iterator.

	    Record names are limited to at most 10 characters.

	    The parser determines from the file contents if the file format is
	    sequential or interleaved, and parses the file accordingly.

	    For more information on the file format, please see:
	    http://evolution.genetics.washington.edu/phylip/doc/sequence.html
	    http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
	    """

	    fmt = "PHYLIP"

	    def _read_header(self, stream):
	        """Read the first line and store the announced alignment dimensions.

	        The line must consist of exactly two integers; they are stored as
	        ``self._number_of_seqs`` and ``self._length_of_seqs``.

	        Raises:
	            ValueError: if the stream is already exhausted, or the first
	                line is not two whitespace-separated integers.
	        """
	        try:
	            line = next(stream)
	        except StopIteration:
	            raise ValueError("Empty file.") from None

	        words = line.split()
	        if len(words) == 2:
	            try:
	                number_of_seqs = int(words[0])
	                length_of_seqs = int(words[1])
	            except ValueError:
	                pass
	            else:
	                # Assign only after both values parsed, so a bad header
	                # never leaves a half-initialized instance behind.
	                self._number_of_seqs = number_of_seqs
	                self._length_of_seqs = length_of_seqs
	                return
	        raise ValueError(
	            "Expected two integers in the first line, received '%s'" % line
	        )

	    def _parse_interleaved_first_block(self, lines, seqs, names):
	        """Parse the first interleaved block, which carries the names.

	        For every line, the first ``_PHYLIP_ID_WIDTH`` characters are the
	        name and the remainder (with spaces removed) is the first chunk of
	        that sequence. Appends to ``names`` and ``seqs`` in place; each
	        entry of ``seqs`` is a list of chunks.
	        """
	        for line in lines:
	            line = line.rstrip()
	            names.append(line[:_PHYLIP_ID_WIDTH].strip())
	            seqs.append([line[_PHYLIP_ID_WIDTH:].strip().replace(" ", "")])

	    def _parse_interleaved_other_blocks(self, stream, seqs):
	        """Parse the remaining interleaved blocks (sequence chunks only).

	        Blocks are separated by blank lines; the i-th non-blank line of a
	        block is appended to the chunk list ``seqs[i]``.

	        Raises:
	            ValueError: if a block does not contain exactly
	                ``self._number_of_seqs`` lines.
	        """
	        expected = self._number_of_seqs
	        i = 0
	        for line in stream:
	            line = line.rstrip()
	            if not line:
	                # A blank line closes a block, which must be complete.
	                # Raised explicitly (not asserted) so the check survives
	                # ``python -O`` and reports the same exception type as the
	                # other format errors.
	                if i != expected:
	                    raise ValueError("Unexpected file format")
	                i = 0
	            else:
	                if i >= expected:
	                    # More lines in this block than sequences announced;
	                    # report it as a format error, not an IndexError.
	                    raise ValueError("Unexpected file format")
	                seqs[i].append(line.replace(" ", ""))
	                i += 1
	        if i != 0 and i != expected:
	            raise ValueError("Unexpected file format")

	    def _parse_sequential(self, lines, seqs, names, length):
	        """Parse lines in the sequential layout.

	        ``length`` is the number of sequence characters already collected
	        for the record in progress (0 means the next line starts a new
	        record and begins with a name field). Appends to ``names`` and
	        ``seqs`` in place and returns the updated ``length`` so parsing can
	        be resumed with further lines.
	        """
	        for line in lines:
	            if length == 0:
	                # Start of a record: fixed-width name, then sequence data.
	                line = line.rstrip()
	                names.append(line[:_PHYLIP_ID_WIDTH].strip())
	                seqs.append([])
	                seq = line[_PHYLIP_ID_WIDTH:].strip()
	            else:
	                # Continuation line: sequence data only.
	                seq = line.strip()
	            seq = seq.replace(" ", "")
	            seqs[-1].append(seq)
	            length += len(seq)
	            if length == self._length_of_seqs:
	                # Record complete; the next line starts a new record.
	                length = 0
	        return length

	    def _read_file(self, stream):
	        """Read all records, auto-detecting sequential vs interleaved layout.

	        The first ``self._number_of_seqs`` lines are read, then one more:
	        if that extra line is non-blank the file is parsed as sequential,
	        otherwise (blank line or end of file) as interleaved.

	        Returns:
	            A tuple ``(names, seqs)`` where ``seqs`` holds, per record, a
	            list of sequence chunks.

	        Raises:
	            ValueError: if the stream ends before ``self._number_of_seqs``
	                lines could be read.
	        """
	        names = []
	        seqs = []
	        lines = []
	        try:
	            for _ in range(self._number_of_seqs):
	                lines.append(next(stream))
	        except StopIteration:
	            # A bare StopIteration escaping from here could be mistaken by
	            # an iterating caller for a normal end of iteration, silently
	            # hiding a truncated file.
	            raise ValueError(
	                "Found %i records in this alignment, told to expect %i"
	                % (len(lines), self._number_of_seqs)
	            ) from None
	        try:
	            line = next(stream)
	        except StopIteration:
	            pass
	        else:
	            if line.rstrip():
	                # sequential file format
	                lines.append(line)
	                length = self._parse_sequential(lines, seqs, names, 0)
	                self._parse_sequential(stream, seqs, names, length)
	                return names, seqs
	        # interleaved file format
	        self._parse_interleaved_first_block(lines, seqs, names)
	        self._parse_interleaved_other_blocks(stream, seqs)
	        return names, seqs

	    def _read_next_alignment(self, stream):
	        """Parse the stream and return the alignment it contains.

	        Validates the parsed records against the header, builds an
	        ``Alignment`` of ``SeqRecord`` objects (gap characters "-" are
	        removed from the stored sequences; the gapped rows are passed to
	        ``Alignment.infer_coordinates``), then drops the header attributes
	        and calls ``self._close()``.

	        Raises:
	            ValueError: if the number of records or any sequence length
	                disagrees with the header, or a sequence contains a dot.
	        """
	        names, seqs = self._read_file(stream)

	        seqs = ["".join(seq) for seq in seqs]
	        if len(seqs) != self._number_of_seqs:
	            raise ValueError(
	                "Found %i records in this alignment, told to expect %i"
	                % (len(seqs), self._number_of_seqs)
	            )
	        for seq in seqs:
	            if len(seq) != self._length_of_seqs:
	                raise ValueError(
	                    "Expected all sequences to have length %d; found %d"
	                    % (self._length_of_seqs, len(seq))
	                )
	            if "." in seq:
	                raise ValueError("PHYLIP format no longer allows dots in sequence")

	        # Coordinates must be inferred from the gapped rows, before the
	        # gap characters are stripped for storage in the records.
	        coordinates = Alignment.infer_coordinates(seqs)
	        seqs = [seq.replace("-", "") for seq in seqs]
	        records = [
	            SeqRecord(Seq(seq), id=name, description="")
	            for (name, seq) in zip(names, seqs)
	        ]
	        alignment = Alignment(records, coordinates)
	        del self._number_of_seqs
	        del self._length_of_seqs
	        self._close()
	        return alignment