Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /Align /Applications /_Mafft.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

19.9 kB

	# Copyright 2009 by Cymon J. Cox. All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Command line wrapper for the multiple alignment programme MAFFT."""


	from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline


	class MafftCommandline(AbstractCommandline):
	    """Command line wrapper for the multiple alignment program MAFFT.

	    http://align.bmr.kyushu-u.ac.jp/mafft/software/

	    Notes
	    -----
	    Last checked against version: MAFFT v6.717b (2009/12/03)

	    Options that take a real number (``weighti``, ``op``, ``ep``, ``lop``,
	    ``lep``, ``lexp``, ``LOP``, ``LEXP``) accept either an int or a float.
	    Options that take a count (``retree``, ``maxiterate``, ``thread``,
	    ``partsize``, ``groupsize``, ``namelength``) must be given an int.

	    References
	    ----------
	    Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
	    multiple ncRNA alignment by incorporating structural information into
	    a MAFFT-based framework (describes RNA structural alignment methods)

	    Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
	    developments in the MAFFT multiple sequence alignment program
	    (outlines version 6)

	    Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an
	    algorithm to build an approximate tree from a large number of
	    unaligned sequences (describes the PartTree algorithm)

	    Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
	    version 5: improvement in accuracy of multiple sequence alignment
	    (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
	    strategies)

	    Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)

	    Examples
	    --------
	    >>> from Bio.Align.Applications import MafftCommandline
	    >>> mafft_exe = "/opt/local/mafft"
	    >>> in_file = "../Doc/examples/opuntia.fasta"
	    >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
	    >>> print(mafft_cline)
	    /opt/local/mafft ../Doc/examples/opuntia.fasta

	    If the mafft binary is on the path (typically the case on a Unix style
	    operating system) then you don't need to supply the executable location:

	    >>> from Bio.Align.Applications import MafftCommandline
	    >>> in_file = "../Doc/examples/opuntia.fasta"
	    >>> mafft_cline = MafftCommandline(input=in_file)
	    >>> print(mafft_cline)
	    mafft ../Doc/examples/opuntia.fasta

	    You would typically run the command line with mafft_cline() or via
	    the Python subprocess module, as described in the Biopython tutorial.

	    Note that MAFFT will write the alignment to stdout, which you may
	    want to save to a file and then parse, e.g.::

	        stdout, stderr = mafft_cline()
	        with open("aligned.fasta", "w") as handle:
	            handle.write(stdout)
	        from Bio import AlignIO
	        align = AlignIO.read("aligned.fasta", "fasta")

	    Alternatively, to parse the output with AlignIO directly you can
	    use StringIO to turn the string into a handle::

	        stdout, stderr = mafft_cline()
	        from io import StringIO
	        from Bio import AlignIO
	        align = AlignIO.read(StringIO(stdout), "fasta")

	    """

	    def __init__(self, cmd="mafft", **kwargs):
	        """Declare the supported MAFFT parameters and initialize the wrapper.

	        Arguments:
	         - cmd - name or path of the executable to run (default "mafft").
	         - kwargs - parameter values keyed by the names declared in
	           ``self.parameters`` (e.g. ``input``); they are passed unchanged
	           to ``AbstractCommandline.__init__``.

	        """
	        BLOSUM_MATRICES = ["30", "45", "62", "80"]

	        def is_int(value):
	            """Return True for values usable with count-like options."""
	            return isinstance(value, int)

	        def is_number(value):
	            """Return True for values usable with real-valued options."""
	            # Whole numbers are accepted too, so op=2 need not be written 2.0.
	            return isinstance(value, (int, float))

	        self.parameters = [
	            # ** Algorithm **
	            # Automatically selects an appropriate strategy from L-INS-i,
	            # FFT-NS-i and FFT-NS-2, according to data size.
	            # Default: off (always FFT-NS-2)
	            _Switch(["--auto", "auto"], "Automatically select strategy. Default off."),
	            # Distance is calculated based on the number of shared 6mers.
	            # Default: on
	            _Switch(
	                ["--6merpair", "6merpair", "sixmerpair"],
	                "Distance is calculated based on the number of shared "
	                "6mers. Default: on",
	            ),
	            # All pairwise alignments are computed with the Needleman-Wunsch
	            # algorithm. More accurate but slower than --6merpair. Suitable for
	            # a set of globally alignable sequences. Applicable to up to ~200
	            # sequences. A combination with --maxiterate 1000 is recommended
	            # (G-INS-i). Default: off (6mer distance is used)
	            _Switch(
	                ["--globalpair", "globalpair"],
	                "All pairwise alignments are computed with the "
	                "Needleman-Wunsch algorithm. Default: off",
	            ),
	            # All pairwise alignments are computed with the Smith-Waterman
	            # algorithm. More accurate but slower than --6merpair. Suitable for
	            # a set of locally alignable sequences. Applicable to up to ~200
	            # sequences. A combination with --maxiterate 1000 is recommended
	            # (L-INS-i). Default: off (6mer distance is used)
	            _Switch(
	                ["--localpair", "localpair"],
	                "All pairwise alignments are computed with the "
	                "Smith-Waterman algorithm. Default: off",
	            ),
	            # All pairwise alignments are computed with a local algorithm with
	            # the generalized affine gap cost (Altschul 1998). More accurate
	            # but slower than --6merpair. Suitable when large internal gaps are
	            # expected. Applicable to up to ~200 sequences. A combination with
	            # --maxiterate 1000 is recommended (E-INS-i). Default: off (6mer
	            # distance is used)
	            _Switch(
	                ["--genafpair", "genafpair"],
	                "All pairwise alignments are computed with a local "
	                "algorithm with the generalized affine gap cost "
	                "(Altschul 1998). Default: off",
	            ),
	            # All pairwise alignments are computed with FASTA (Pearson and
	            # Lipman 1988). FASTA is required. Default: off (6mer distance is
	            # used)
	            _Switch(
	                ["--fastapair", "fastapair"],
	                "All pairwise alignments are computed with FASTA "
	                "(Pearson and Lipman 1988). Default: off",
	            ),
	            # Weighting factor for the consistency term calculated from
	            # pairwise alignments. Valid when either of --globalpair,
	            # --localpair, --genafpair, --fastapair or --blastpair is selected.
	            # Default: 2.7
	            _Option(
	                ["--weighti", "weighti"],
	                "Weighting factor for the consistency term calculated "
	                "from pairwise alignments. Default: 2.7",
	                checker_function=is_number,
	                equate=False,
	            ),
	            # Guide tree is built number times in the progressive stage. Valid
	            # with 6mer distance. Default: 2
	            _Option(
	                ["--retree", "retree"],
	                "Guide tree is built number times in the progressive "
	                "stage. Valid with 6mer distance. Default: 2",
	                checker_function=is_int,
	                equate=False,
	            ),
	            # Number cycles of iterative refinement are performed. Default: 0
	            _Option(
	                ["--maxiterate", "maxiterate"],
	                "Number cycles of iterative refinement are performed. Default: 0",
	                checker_function=is_int,
	                equate=False,
	            ),
	            # Number of threads to use. Default: 1
	            _Option(
	                ["--thread", "thread"],
	                "Number of threads to use. Default: 1",
	                checker_function=is_int,
	                equate=False,
	            ),
	            # Use FFT approximation in group-to-group alignment. Default: on
	            _Switch(
	                ["--fft", "fft"],
	                "Use FFT approximation in group-to-group alignment. Default: on",
	            ),
	            # Do not use FFT approximation in group-to-group alignment.
	            # Default: off
	            _Switch(
	                ["--nofft", "nofft"],
	                "Do not use FFT approximation in group-to-group "
	                "alignment. Default: off",
	            ),
	            # Alignment score is not checked in the iterative refinement stage.
	            # Default: off (score is checked)
	            _Switch(
	                ["--noscore", "noscore"],
	                "Alignment score is not checked in the iterative "
	                "refinement stage. Default: off (score is checked)",
	            ),
	            # Use the Myers-Miller (1988) algorithm. Default: automatically
	            # turned on when the alignment length exceeds 10,000 (aa/nt).
	            _Switch(
	                ["--memsave", "memsave"],
	                "Use the Myers-Miller (1988) algorithm. Default: "
	                "automatically turned on when the alignment length "
	                "exceeds 10,000 (aa/nt).",
	            ),
	            # Use a fast tree-building method (PartTree, Katoh and Toh 2007)
	            # with the 6mer distance. Recommended when a large number
	            # (> ~10,000) of sequences are input. Default: off
	            _Switch(
	                ["--parttree", "parttree"],
	                "Use a fast tree-building method with the 6mer "
	                "distance. Default: off",
	            ),
	            # The PartTree algorithm is used with distances based on DP.
	            # Slightly more accurate and slower than --parttree. Recommended
	            # when a large number (> ~10,000) of sequences are input.
	            # Default: off
	            _Switch(
	                ["--dpparttree", "dpparttree"],
	                "The PartTree algorithm is used with distances "
	                "based on DP. Default: off",
	            ),
	            # The PartTree algorithm is used with distances based on FASTA.
	            # Slightly more accurate and slower than --parttree. Recommended
	            # when a large number (> ~10,000) of sequences are input. FASTA is
	            # required. Default: off
	            _Switch(
	                ["--fastaparttree", "fastaparttree"],
	                "The PartTree algorithm is used with distances based "
	                "on FASTA. Default: off",
	            ),
	            # The number of partitions in the PartTree algorithm. Default: 50
	            _Option(
	                ["--partsize", "partsize"],
	                "The number of partitions in the PartTree algorithm. Default: 50",
	                checker_function=is_int,
	                equate=False,
	            ),
	            # Do not make alignment larger than number sequences. Valid only
	            # with the --*parttree options. Default: the number of input
	            # sequences.
	            # This carries a numeric value, so it is an _Option rather than a
	            # bare _Switch (a switch would emit the flag without its number).
	            _Option(
	                ["--groupsize", "groupsize"],
	                "Do not make alignment larger than number sequences. "
	                "Default: the number of input sequences",
	                checker_function=is_int,
	                equate=False,
	            ),
	            # Adjust direction according to the first sequence
	            # Mafft V6 beta function
	            _Switch(
	                ["--adjustdirection", "adjustdirection"],
	                "Adjust direction according to the first sequence. Default off.",
	            ),
	            # Adjust direction according to the first sequence
	            # for highly diverged data; very slow
	            # Mafft V6 beta function
	            _Switch(
	                ["--adjustdirectionaccurately", "adjustdirectionaccurately"],
	                "Adjust direction according to the first sequence, "
	                "for highly diverged data; very slow. "
	                "Default off.",
	            ),
	            # ** Parameter **
	            # Gap opening penalty at group-to-group alignment. Default: 1.53
	            _Option(
	                ["--op", "op"],
	                "Gap opening penalty at group-to-group alignment. Default: 1.53",
	                checker_function=is_number,
	                equate=False,
	            ),
	            # Offset value, which works like gap extension penalty, for
	            # group-to-group alignment. Default: 0.123
	            _Option(
	                ["--ep", "ep"],
	                "Offset value, which works like gap extension penalty, "
	                "for group-to-group alignment. Default: 0.123",
	                checker_function=is_number,
	                equate=False,
	            ),
	            # Gap opening penalty at local pairwise alignment. Valid when the
	            # --localpair or --genafpair option is selected. Default: -2.00
	            _Option(
	                ["--lop", "lop"],
	                "Gap opening penalty at local pairwise alignment. Default: -2.00",
	                checker_function=is_number,
	                equate=False,
	            ),
	            # Offset value at local pairwise alignment. Valid when the
	            # --localpair or --genafpair option is selected. Default: 0.1
	            _Option(
	                ["--lep", "lep"],
	                "Offset value at local pairwise alignment. Default: 0.1",
	                checker_function=is_number,
	                equate=False,
	            ),
	            # Gap extension penalty at local pairwise alignment. Valid when the
	            # --localpair or --genafpair option is selected. Default: -0.1
	            _Option(
	                ["--lexp", "lexp"],
	                "Gap extension penalty at local pairwise alignment. Default: -0.1",
	                checker_function=is_number,
	                equate=False,
	            ),
	            # Gap opening penalty to skip the alignment. Valid when the
	            # --genafpair option is selected. Default: -6.00
	            _Option(
	                ["--LOP", "LOP"],
	                "Gap opening penalty to skip the alignment. Default: -6.00",
	                checker_function=is_number,
	                equate=False,
	            ),
	            # Gap extension penalty to skip the alignment. Valid when the
	            # --genafpair option is selected. Default: 0.00
	            _Option(
	                ["--LEXP", "LEXP"],
	                "Gap extension penalty to skip the alignment. Default: 0.00",
	                checker_function=is_number,
	                equate=False,
	            ),
	            # BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
	            # number=30, 45, 62 or 80. Default: 62
	            # Note the value is checked against strings ("62", not 62).
	            _Option(
	                ["--bl", "bl"],
	                "BLOSUM number matrix is used. Default: 62",
	                checker_function=lambda x: x in BLOSUM_MATRICES,
	                equate=False,
	            ),
	            # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
	            # Default: BLOSUM62
	            _Option(
	                ["--jtt", "jtt"],
	                "JTT PAM number (Jones et al. 1992) matrix is used. "
	                "number>0. Default: BLOSUM62",
	                equate=False,
	            ),
	            # Transmembrane PAM number (Jones et al. 1994) matrix is used.
	            # number>0. Default: BLOSUM62
	            _Option(
	                ["--tm", "tm"],
	                "Transmembrane PAM number (Jones et al. 1994) "
	                "matrix is used. number>0. Default: BLOSUM62",
	                filename=True,  # to ensure spaced inputs are quoted
	                equate=False,
	            ),
	            # Use a user-defined AA scoring matrix. The format of matrixfile is
	            # the same to that of BLAST. Ignored when nucleotide sequences are
	            # input. Default: BLOSUM62
	            _Option(
	                ["--aamatrix", "aamatrix"],
	                "Use a user-defined AA scoring matrix. Default: BLOSUM62",
	                filename=True,  # to ensure spaced inputs are quoted
	                equate=False,
	            ),
	            # Incorporate the AA/nuc composition information into the scoring
	            # matrix. Default: off
	            _Switch(
	                ["--fmodel", "fmodel"],
	                "Incorporate the AA/nuc composition information into "
	                "the scoring matrix (True) or not (False, default)",
	            ),
	            # ** Output **
	            # Name length for CLUSTAL and PHYLIP format output
	            _Option(
	                ["--namelength", "namelength"],
	                """Name length in CLUSTAL and PHYLIP output.

	                MAFFT v6.847 (2011) added --namelength for use with
	                the --clustalout option for CLUSTAL output.

	                MAFFT v7.024 (2013) added support for this with the
	                --phylipout option for PHYLIP output (default 10).
	                """,
	                checker_function=is_int,
	                equate=False,
	            ),
	            # Output format: clustal format. Default: off (fasta format)
	            _Switch(
	                ["--clustalout", "clustalout"],
	                "Output format: clustal (True) or fasta (False, default)",
	            ),
	            # Output format: phylip format.
	            # Added in beta with v6.847, fixed in v6.850 (2011)
	            _Switch(
	                ["--phylipout", "phylipout"],
	                "Output format: phylip (True), or fasta (False, default)",
	            ),
	            # Output order: same as input. Default: on
	            _Switch(
	                ["--inputorder", "inputorder"],
	                "Output order: same as input (True, default) or alignment "
	                "based (False)",
	            ),
	            # Output order: aligned. Default: off (inputorder)
	            _Switch(
	                ["--reorder", "reorder"],
	                "Output order: aligned (True) or in input order (False, default)",
	            ),
	            # Guide tree is output to the input.tree file. Default: off
	            _Switch(
	                ["--treeout", "treeout"],
	                "Guide tree is output to the input.tree file (True) or "
	                "not (False, default)",
	            ),
	            # Do not report progress. Default: off
	            _Switch(
	                ["--quiet", "quiet"],
	                "Do not report progress (True) or not (False, default).",
	            ),
	            # ** Input **
	            # Assume the sequences are nucleotide. Default: auto
	            _Switch(
	                ["--nuc", "nuc"],
	                "Assume the sequences are nucleotide (True/False). Default: auto",
	            ),
	            # Assume the sequences are amino acid. Default: auto
	            _Switch(
	                ["--amino", "amino"],
	                "Assume the sequences are amino acid (True/False). Default: auto",
	            ),
	            # MAFFT has multiple --seed commands where the unaligned input is
	            # aligned to the seed alignment. There can be multiple seeds in the
	            # form: "mafft --seed align1 --seed align2 [etc] input"
	            # Effectively for n number of seed alignments.
	            # TODO - Can we use class _ArgumentList here?
	            _Option(
	                ["--seed", "seed"],
	                "Seed alignments given in alignment_n (fasta format) "
	                "are aligned with sequences in input.",
	                filename=True,
	                equate=False,
	            ),
	            # The input (must be FASTA format)
	            _Argument(["input"], "Input file name", filename=True, is_required=True),
	            # mafft-profile takes a second alignment input as an argument:
	            # mafft-profile align1 align2
	            _Argument(
	                ["input1"],
	                "Second input file name for the mafft-profile command",
	                filename=True,
	            ),
	        ]
	        # The parameter list must exist before the base initializer runs,
	        # since the keyword arguments refer to the names declared above.
	        super().__init__(cmd, **kwargs)


	if __name__ == "__main__":
	    # Executed directly as a script: hand over to Biopython's doctest
	    # runner helper.
	    from Bio import _utils

	    _utils.run_doctest()