# aakash0017's picture
# Upload folder using huggingface_hub
# b7731cd
# Copyright 2009 by Cymon J. Cox. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment programme MAFFT."""
from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline
class MafftCommandline(AbstractCommandline):
    """Command line wrapper for the multiple alignment program MAFFT.

    http://align.bmr.kyushu-u.ac.jp/mafft/software/

    Notes
    -----
    Last checked against version: MAFFT v6.717b (2009/12/03)

    References
    ----------
    Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
    multiple ncRNA alignment by incorporating structural information into
    a MAFFT-based framework (describes RNA structural alignment methods)

    Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
    developments in the MAFFT multiple sequence alignment program
    (outlines version 6)

    Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an
    algorithm to build an approximate tree from a large number of
    unaligned sequences (describes the PartTree algorithm)

    Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
    version 5: improvement in accuracy of multiple sequence alignment
    (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
    strategies)

    Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)
    MAFFT: a novel method for rapid multiple sequence alignment based on
    fast Fourier transform

    Examples
    --------
    >>> from Bio.Align.Applications import MafftCommandline
    >>> mafft_exe = "/opt/local/mafft"
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
    >>> print(mafft_cline)
    /opt/local/mafft ../Doc/examples/opuntia.fasta

    If the mafft binary is on the path (typically the case on a Unix style
    operating system) then you don't need to supply the executable location:

    >>> from Bio.Align.Applications import MafftCommandline
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(input=in_file)
    >>> print(mafft_cline)
    mafft ../Doc/examples/opuntia.fasta

    You would typically run the command line with mafft_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    Note that MAFFT will write the alignment to stdout, which you may
    want to save to a file and then parse, e.g.::

        stdout, stderr = mafft_cline()
        with open("aligned.fasta", "w") as handle:
            handle.write(stdout)
        from Bio import AlignIO
        align = AlignIO.read("aligned.fasta", "fasta")

    Alternatively, to parse the output with AlignIO directly you can
    use StringIO to turn the string into a handle::

        stdout, stderr = mafft_cline()
        from io import StringIO
        from Bio import AlignIO
        align = AlignIO.read(StringIO(stdout), "fasta")

    """

    def __init__(self, cmd="mafft", **kwargs):
        """Initialize the class.

        Builds ``self.parameters``, the list of switches, options and
        positional arguments understood by this wrapper, and then hands
        ``cmd`` and ``kwargs`` to ``AbstractCommandline.__init__``.

        Arguments:
         - cmd - name or path of the MAFFT executable (default "mafft").
         - kwargs - forwarded unchanged to ``AbstractCommandline.__init__``;
           as in the class examples, a keyword such as ``input=...`` uses
           one of the parameter names declared below.

        """
        # Accepted values for --bl. These are strings, so an integer such
        # as 62 does not pass the membership check used below.
        BLOSUM_MATRICES = ["30", "45", "62", "80"]
        # NOTE(review): the numeric options below validate with a strict
        # isinstance(x, float) / isinstance(x, int) test, so e.g. op=2 (an
        # int) is rejected by the float checker. Kept as-is for
        # compatibility with existing callers.
        self.parameters = [
            # **** Algorithm ****
            # Automatically selects an appropriate strategy from L-INS-i,
            # FFT-NS-i and FFT-NS-2, according to data size.
            # Default: off (always FFT-NS-2)
            _Switch(["--auto", "auto"], "Automatically select strategy. Default off."),
            # Distance is calculated based on the number of shared 6mers.
            # Default: on
            _Switch(
                ["--6merpair", "6merpair", "sixmerpair"],
                "Distance is calculated based on the number of shared "
                "6mers. Default: on",
            ),
            # All pairwise alignments are computed with the Needleman-Wunsch
            # algorithm. More accurate but slower than --6merpair. Suitable
            # for a set of globally alignable sequences. Applicable to up to
            # ~200 sequences. A combination with --maxiterate 1000 is
            # recommended (G-INS-i). Default: off (6mer distance is used)
            _Switch(
                ["--globalpair", "globalpair"],
                "All pairwise alignments are computed with the "
                "Needleman-Wunsch algorithm. Default: off",
            ),
            # All pairwise alignments are computed with the Smith-Waterman
            # algorithm. More accurate but slower than --6merpair. Suitable
            # for a set of locally alignable sequences. Applicable to up to
            # ~200 sequences. A combination with --maxiterate 1000 is
            # recommended (L-INS-i). Default: off (6mer distance is used)
            _Switch(
                ["--localpair", "localpair"],
                "All pairwise alignments are computed with the "
                "Smith-Waterman algorithm. Default: off",
            ),
            # All pairwise alignments are computed with a local algorithm
            # with the generalized affine gap cost (Altschul 1998). More
            # accurate but slower than --6merpair. Suitable when large
            # internal gaps are expected. Applicable to up to ~200 sequences.
            # A combination with --maxiterate 1000 is recommended (E-INS-i).
            # Default: off (6mer distance is used)
            _Switch(
                ["--genafpair", "genafpair"],
                "All pairwise alignments are computed with a local "
                "algorithm with the generalized affine gap cost "
                "(Altschul 1998). Default: off",
            ),
            # All pairwise alignments are computed with FASTA (Pearson and
            # Lipman 1988). FASTA is required.
            # Default: off (6mer distance is used)
            _Switch(
                ["--fastapair", "fastapair"],
                "All pairwise alignments are computed with FASTA "
                "(Pearson and Lipman 1988). Default: off",
            ),
            # Weighting factor for the consistency term calculated from
            # pairwise alignments. Valid when either of --globalpair,
            # --localpair, --genafpair, --fastapair or --blastpair is
            # selected. Default: 2.7
            _Option(
                ["--weighti", "weighti"],
                "Weighting factor for the consistency term calculated "
                "from pairwise alignments. Default: 2.7",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Guide tree is built number times in the progressive stage.
            # Valid with 6mer distance. Default: 2
            _Option(
                ["--retree", "retree"],
                "Guide tree is built number times in the progressive "
                "stage. Valid with 6mer distance. Default: 2",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Number cycles of iterative refinement are performed. Default: 0
            _Option(
                ["--maxiterate", "maxiterate"],
                "Number cycles of iterative refinement are performed. Default: 0",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Number of threads to use. Default: 1
            _Option(
                ["--thread", "thread"],
                "Number of threads to use. Default: 1",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Use FFT approximation in group-to-group alignment. Default: on
            _Switch(
                ["--fft", "fft"],
                "Use FFT approximation in group-to-group alignment. Default: on",
            ),
            # Do not use FFT approximation in group-to-group alignment.
            # Default: off
            _Switch(
                ["--nofft", "nofft"],
                "Do not use FFT approximation in group-to-group "
                "alignment. Default: off",
            ),
            # Alignment score is not checked in the iterative refinement
            # stage. Default: off (score is checked)
            _Switch(
                ["--noscore", "noscore"],
                "Alignment score is not checked in the iterative "
                "refinement stage. Default: off (score is checked)",
            ),
            # Use the Myers-Miller (1988) algorithm. Default: automatically
            # turned on when the alignment length exceeds 10,000 (aa/nt).
            _Switch(
                ["--memsave", "memsave"],
                "Use the Myers-Miller (1988) algorithm. Default: "
                "automatically turned on when the alignment length "
                "exceeds 10,000 (aa/nt).",
            ),
            # Use a fast tree-building method (PartTree, Katoh and Toh 2007)
            # with the 6mer distance. Recommended when a large number
            # (> ~10,000) of sequences are input. Default: off
            _Switch(
                ["--parttree", "parttree"],
                "Use a fast tree-building method with the 6mer "
                "distance. Default: off",
            ),
            # The PartTree algorithm is used with distances based on DP.
            # Slightly more accurate and slower than --parttree. Recommended
            # when a large number (> ~10,000) of sequences are input.
            # Default: off
            _Switch(
                ["--dpparttree", "dpparttree"],
                "The PartTree algorithm is used with distances "
                "based on DP. Default: off",
            ),
            # The PartTree algorithm is used with distances based on FASTA.
            # Slightly more accurate and slower than --parttree. Recommended
            # when a large number (> ~10,000) of sequences are input. FASTA
            # is required. Default: off
            _Switch(
                ["--fastaparttree", "fastaparttree"],
                "The PartTree algorithm is used with distances based "
                "on FASTA. Default: off",
            ),
            # The number of partitions in the PartTree algorithm. Default: 50
            _Option(
                ["--partsize", "partsize"],
                "The number of partitions in the PartTree algorithm. Default: 50",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Do not make alignment larger than number sequences. Valid only
            # with the --*parttree options.
            # Default: the number of input sequences
            # NOTE(review): the description implies a numeric argument, but
            # this is declared as a _Switch, so no number can be supplied.
            # Left unchanged to keep the existing interface - confirm
            # against the MAFFT manual before converting to an _Option.
            _Switch(
                ["--groupsize", "groupsize"],
                "Do not make alignment larger than number sequences. "
                "Default: the number of input sequences",
            ),
            # Adjust direction according to the first sequence
            # Mafft V6 beta function
            _Switch(
                ["--adjustdirection", "adjustdirection"],
                "Adjust direction according to the first sequence. Default off.",
            ),
            # Adjust direction according to the first sequence
            # for highly diverged data; very slow
            # Mafft V6 beta function
            _Switch(
                ["--adjustdirectionaccurately", "adjustdirectionaccurately"],
                # Adjacent literals are concatenated verbatim, so each
                # fragment must carry its own trailing separator.
                "Adjust direction according to the first sequence, "
                "for highly diverged data; very slow. "
                "Default off.",
            ),
            # **** Parameter ****
            # Gap opening penalty at group-to-group alignment. Default: 1.53
            _Option(
                ["--op", "op"],
                "Gap opening penalty at group-to-group alignment. Default: 1.53",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Offset value, which works like gap extension penalty, for
            # group-to-group alignment. Default: 0.123
            _Option(
                ["--ep", "ep"],
                "Offset value, which works like gap extension penalty, "
                "for group-to-group alignment. Default: 0.123",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Gap opening penalty at local pairwise alignment. Valid when the
            # --localpair or --genafpair option is selected. Default: -2.00
            _Option(
                ["--lop", "lop"],
                "Gap opening penalty at local pairwise alignment. Default: -2.00",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Offset value at local pairwise alignment. Valid when the
            # --localpair or --genafpair option is selected. Default: 0.1
            _Option(
                ["--lep", "lep"],
                "Offset value at local pairwise alignment. Default: 0.1",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Gap extension penalty at local pairwise alignment. Valid when
            # the --localpair or --genafpair option is selected.
            # Default: -0.1
            _Option(
                ["--lexp", "lexp"],
                "Gap extension penalty at local pairwise alignment. Default: -0.1",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Gap opening penalty to skip the alignment. Valid when the
            # --genafpair option is selected. Default: -6.00
            _Option(
                ["--LOP", "LOP"],
                "Gap opening penalty to skip the alignment. Default: -6.00",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Gap extension penalty to skip the alignment. Valid when the
            # --genafpair option is selected. Default: 0.00
            _Option(
                ["--LEXP", "LEXP"],
                "Gap extension penalty to skip the alignment. Default: 0.00",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
            # number=30, 45, 62 or 80. Default: 62
            _Option(
                ["--bl", "bl"],
                "BLOSUM number matrix is used. Default: 62",
                checker_function=lambda x: x in BLOSUM_MATRICES,
                equate=False,
            ),
            # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
            # Default: BLOSUM62
            # (No checker_function is supplied for this option.)
            _Option(
                ["--jtt", "jtt"],
                "JTT PAM number (Jones et al. 1992) matrix is used. "
                "number>0. Default: BLOSUM62",
                equate=False,
            ),
            # Transmembrane PAM number (Jones et al. 1994) matrix is used.
            # number>0. Default: BLOSUM62
            # NOTE(review): described as a number yet flagged filename=True;
            # kept as in the original - confirm this is intentional.
            _Option(
                ["--tm", "tm"],
                "Transmembrane PAM number (Jones et al. 1994) "
                "matrix is used. number>0. Default: BLOSUM62",
                filename=True,  # to ensure spaced inputs are quoted
                equate=False,
            ),
            # Use a user-defined AA scoring matrix. The format of matrixfile
            # is the same to that of BLAST. Ignored when nucleotide sequences
            # are input. Default: BLOSUM62
            _Option(
                ["--aamatrix", "aamatrix"],
                "Use a user-defined AA scoring matrix. Default: BLOSUM62",
                filename=True,  # to ensure spaced inputs are quoted
                equate=False,
            ),
            # Incorporate the AA/nuc composition information into the scoring
            # matrix. Default: off
            _Switch(
                ["--fmodel", "fmodel"],
                "Incorporate the AA/nuc composition information into "
                "the scoring matrix (True) or not (False, default)",
            ),
            # **** Output ****
            # Name length for CLUSTAL and PHYLIP format output
            _Option(
                ["--namelength", "namelength"],
                # Written with explicit newlines so the help text does not
                # depend on how this source file is indented.
                "Name length in CLUSTAL and PHYLIP output.\n"
                "MAFFT v6.847 (2011) added --namelength for use with\n"
                "the --clustalout option for CLUSTAL output.\n"
                "MAFFT v7.024 (2013) added support for this with the\n"
                "--phylipout option for PHYLIP output (default 10).\n",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Output format: clustal format. Default: off (fasta format)
            _Switch(
                ["--clustalout", "clustalout"],
                "Output format: clustal (True) or fasta (False, default)",
            ),
            # Output format: phylip format.
            # Added in beta with v6.847, fixed in v6.850 (2011)
            _Switch(
                ["--phylipout", "phylipout"],
                "Output format: phylip (True), or fasta (False, default)",
            ),
            # Output order: same as input. Default: on
            _Switch(
                ["--inputorder", "inputorder"],
                "Output order: same as input (True, default) or alignment "
                "based (False)",
            ),
            # Output order: aligned. Default: off (inputorder)
            _Switch(
                ["--reorder", "reorder"],
                "Output order: aligned (True) or in input order (False, default)",
            ),
            # Guide tree is output to the input.tree file. Default: off
            _Switch(
                ["--treeout", "treeout"],
                "Guide tree is output to the input.tree file (True) or "
                "not (False, default)",
            ),
            # Do not report progress. Default: off
            _Switch(
                ["--quiet", "quiet"],
                "Do not report progress (True) or not (False, default).",
            ),
            # **** Input ****
            # Assume the sequences are nucleotide. Default: auto
            _Switch(
                ["--nuc", "nuc"],
                "Assume the sequences are nucleotide (True/False). Default: auto",
            ),
            # Assume the sequences are amino acid. Default: auto
            _Switch(
                ["--amino", "amino"],
                "Assume the sequences are amino acid (True/False). Default: auto",
            ),
            # MAFFT has multiple --seed commands where the unaligned input is
            # aligned to the seed alignment. There can be multiple seeds in
            # the form: "mafft --seed align1 --seed align2 [etc] input"
            # Effectively for n number of seed alignments.
            # TODO - Can we use class _ArgumentList here?
            _Option(
                ["--seed", "seed"],
                "Seed alignments given in alignment_n (fasta format) "
                "are aligned with sequences in input.",
                filename=True,
                equate=False,
            ),
            # The input (must be FASTA format)
            _Argument(["input"], "Input file name", filename=True, is_required=True),
            # mafft-profile takes a second alignment input as an argument:
            # mafft-profile align1 align2
            _Argument(
                ["input1"],
                "Second input file name for the mafft-profile command",
                filename=True,
            ),
        ]
        # The parameter list must exist before the base initializer runs,
        # because cmd and the keyword arguments are handed to it here.
        AbstractCommandline.__init__(self, cmd, **kwargs)
if __name__ == "__main__":
    # Only when executed as a script: hand off to Biopython's run_doctest
    # helper (presumably runs the doctest examples in this module).
    from Bio._utils import run_doctest as _run_module_doctests

    _run_module_doctests()