Spaces:
No application file
No application file
# Copyright 2009 by Cymon J. Cox. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment programme MAFFT."""
from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline | |
class MafftCommandline(AbstractCommandline):
    """Command line wrapper for the multiple alignment program MAFFT.

    http://align.bmr.kyushu-u.ac.jp/mafft/software/

    Notes
    -----
    Last checked against version: MAFFT v6.717b (2009/12/03)

    References
    ----------
    Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
    multiple ncRNA alignment by incorporating structural information into
    a MAFFT-based framework (describes RNA structural alignment methods)

    Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
    developments in the MAFFT multiple sequence alignment program
    (outlines version 6)

    Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an
    algorithm to build an approximate tree from a large number of
    unaligned sequences (describes the PartTree algorithm)

    Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
    version 5: improvement in accuracy of multiple sequence alignment
    (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
    strategies)

    Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)

    Examples
    --------
    >>> from Bio.Align.Applications import MafftCommandline
    >>> mafft_exe = "/opt/local/mafft"
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
    >>> print(mafft_cline)
    /opt/local/mafft ../Doc/examples/opuntia.fasta

    If the mafft binary is on the path (typically the case on a Unix style
    operating system) then you don't need to supply the executable location:

    >>> from Bio.Align.Applications import MafftCommandline
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(input=in_file)
    >>> print(mafft_cline)
    mafft ../Doc/examples/opuntia.fasta

    You would typically run the command line with mafft_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    Note that MAFFT will write the alignment to stdout, which you may
    want to save to a file and then parse, e.g.::

        stdout, stderr = mafft_cline()
        with open("aligned.fasta", "w") as handle:
            handle.write(stdout)
        from Bio import AlignIO
        align = AlignIO.read("aligned.fasta", "fasta")

    Alternatively, to parse the output with AlignIO directly you can
    use StringIO to turn the string into a handle::

        stdout, stderr = mafft_cline()
        from io import StringIO
        from Bio import AlignIO
        align = AlignIO.read(StringIO(stdout), "fasta")

    """

    def __init__(self, cmd="mafft", **kwargs):
        """Initialize the class.

        Arguments:
         - cmd - name or path of the MAFFT executable (default "mafft").
         - kwargs - option values keyed by the parameter names declared
           in ``self.parameters`` below (for example ``input=...``); they
           are passed on unchanged to ``AbstractCommandline.__init__``.

        """
        # --bl is validated against these strings, so the value must be
        # given as text (e.g. "62"), not as an integer.
        BLOSUM_MATRICES = ["30", "45", "62", "80"]
        self.parameters = [
            # **** Algorithm ****
            # Automatically selects an appropriate strategy from L-INS-i, FFT-NS-
            # i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2)
            _Switch(["--auto", "auto"], "Automatically select strategy. Default off."),
            # Distance is calculated based on the number of shared 6mers. Default: on
            _Switch(
                ["--6merpair", "6merpair", "sixmerpair"],
                "Distance is calculated based on the number of shared "
                "6mers. Default: on",
            ),
            # All pairwise alignments are computed with the Needleman-Wunsch
            # algorithm. More accurate but slower than --6merpair. Suitable for a
            # set of globally alignable sequences. Applicable to up to ~200
            # sequences. A combination with --maxiterate 1000 is recommended (G-
            # INS-i). Default: off (6mer distance is used)
            _Switch(
                ["--globalpair", "globalpair"],
                "All pairwise alignments are computed with the "
                "Needleman-Wunsch algorithm. Default: off",
            ),
            # All pairwise alignments are computed with the Smith-Waterman
            # algorithm. More accurate but slower than --6merpair. Suitable for a
            # set of locally alignable sequences. Applicable to up to ~200
            # sequences. A combination with --maxiterate 1000 is recommended (L-
            # INS-i). Default: off (6mer distance is used)
            _Switch(
                ["--localpair", "localpair"],
                "All pairwise alignments are computed with the "
                "Smith-Waterman algorithm. Default: off",
            ),
            # All pairwise alignments are computed with a local algorithm with
            # the generalized affine gap cost (Altschul 1998). More accurate but
            # slower than --6merpair. Suitable when large internal gaps are
            # expected. Applicable to up to ~200 sequences. A combination with --
            # maxiterate 1000 is recommended (E-INS-i). Default: off (6mer
            # distance is used)
            _Switch(
                ["--genafpair", "genafpair"],
                "All pairwise alignments are computed with a local "
                "algorithm with the generalized affine gap cost "
                "(Altschul 1998). Default: off",
            ),
            # All pairwise alignments are computed with FASTA (Pearson and Lipman
            # 1988). FASTA is required. Default: off (6mer distance is used)
            _Switch(
                ["--fastapair", "fastapair"],
                "All pairwise alignments are computed with FASTA "
                "(Pearson and Lipman 1988). Default: off",
            ),
            # Weighting factor for the consistency term calculated from pairwise
            # alignments. Valid when either of --globalpair, --localpair, --
            # genafpair, --fastapair or --blastpair is selected. Default: 2.7
            _Option(
                ["--weighti", "weighti"],
                "Weighting factor for the consistency term calculated "
                "from pairwise alignments. Default: 2.7",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Guide tree is built number times in the progressive stage. Valid
            # with 6mer distance. Default: 2
            _Option(
                ["--retree", "retree"],
                "Guide tree is built number times in the progressive "
                "stage. Valid with 6mer distance. Default: 2",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Number cycles of iterative refinement are performed. Default: 0
            _Option(
                ["--maxiterate", "maxiterate"],
                "Number cycles of iterative refinement are performed. Default: 0",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Number of threads to use. Default: 1
            _Option(
                ["--thread", "thread"],
                "Number of threads to use. Default: 1",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Use FFT approximation in group-to-group alignment. Default: on
            _Switch(
                ["--fft", "fft"],
                "Use FFT approximation in group-to-group alignment. Default: on",
            ),
            # Do not use FFT approximation in group-to-group alignment. Default:
            # off
            _Switch(
                ["--nofft", "nofft"],
                "Do not use FFT approximation in group-to-group "
                "alignment. Default: off",
            ),
            # Alignment score is not checked in the iterative refinement stage.
            # Default: off (score is checked)
            _Switch(
                ["--noscore", "noscore"],
                "Alignment score is not checked in the iterative "
                "refinement stage. Default: off (score is checked)",
            ),
            # Use the Myers-Miller (1988) algorithm. Default: automatically
            # turned on when the alignment length exceeds 10,000 (aa/nt).
            _Switch(
                ["--memsave", "memsave"],
                "Use the Myers-Miller (1988) algorithm. Default: "
                "automatically turned on when the alignment length "
                "exceeds 10,000 (aa/nt).",
            ),
            # Use a fast tree-building method (PartTree, Katoh and Toh 2007) with
            # the 6mer distance. Recommended for a large number (> ~10,000) of
            # sequences are input. Default: off
            _Switch(
                ["--parttree", "parttree"],
                "Use a fast tree-building method with the 6mer "
                "distance. Default: off",
            ),
            # The PartTree algorithm is used with distances based on DP. Slightly
            # more accurate and slower than --parttree. Recommended for a large
            # number (> ~10,000) of sequences are input. Default: off
            _Switch(
                ["--dpparttree", "dpparttree"],
                "The PartTree algorithm is used with distances "
                "based on DP. Default: off",
            ),
            # The PartTree algorithm is used with distances based on FASTA.
            # Slightly more accurate and slower than --parttree. Recommended for
            # a large number (> ~10,000) of sequences are input. FASTA is
            # required. Default: off
            _Switch(
                ["--fastaparttree", "fastaparttree"],
                "The PartTree algorithm is used with distances based "
                "on FASTA. Default: off",
            ),
            # The number of partitions in the PartTree algorithm. Default: 50
            _Option(
                ["--partsize", "partsize"],
                "The number of partitions in the PartTree algorithm. Default: 50",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Do not make alignment larger than number sequences. Valid only with
            # the --*parttree options. Default: the number of input sequences
            # NOTE: --groupsize takes a numeric argument, so it is declared as
            # an integer-valued option (like --partsize) rather than a bare
            # switch, which would emit the flag without its required number.
            _Option(
                ["--groupsize", "groupsize"],
                "Do not make alignment larger than number sequences. "
                "Default: the number of input sequences",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Adjust direction according to the first sequence
            # Mafft V6 beta function
            _Switch(
                ["--adjustdirection", "adjustdirection"],
                "Adjust direction according to the first sequence. Default off.",
            ),
            # Adjust direction according to the first sequence
            # for highly diverged data; very slow
            # Mafft V6 beta function
            _Switch(
                ["--adjustdirectionaccurately", "adjustdirectionaccurately"],
                "Adjust direction according to the first sequence, "
                "for highly diverged data; very slow. "
                "Default off.",
            ),
            # **** Parameter ****
            # Gap opening penalty at group-to-group alignment. Default: 1.53
            _Option(
                ["--op", "op"],
                "Gap opening penalty at group-to-group alignment. Default: 1.53",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Offset value, which works like gap extension penalty, for group-to-
            # group alignment. Default: 0.123
            _Option(
                ["--ep", "ep"],
                "Offset value, which works like gap extension penalty, "
                "for group-to-group alignment. Default: 0.123",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Gap opening penalty at local pairwise alignment. Valid when the --
            # localpair or --genafpair option is selected. Default: -2.00
            _Option(
                ["--lop", "lop"],
                "Gap opening penalty at local pairwise alignment. Default: -2.00",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Offset value at local pairwise alignment. Valid when the --
            # localpair or --genafpair option is selected. Default: 0.1
            _Option(
                ["--lep", "lep"],
                "Offset value at local pairwise alignment. Default: 0.1",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Gap extension penalty at local pairwise alignment. Valid when the -
            # -localpair or --genafpair option is selected. Default: -0.1
            _Option(
                ["--lexp", "lexp"],
                "Gap extension penalty at local pairwise alignment. Default: -0.1",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Gap opening penalty to skip the alignment. Valid when the --
            # genafpair option is selected. Default: -6.00
            _Option(
                ["--LOP", "LOP"],
                "Gap opening penalty to skip the alignment. Default: -6.00",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # Gap extension penalty to skip the alignment. Valid when the --
            # genafpair option is selected. Default: 0.00
            _Option(
                ["--LEXP", "LEXP"],
                "Gap extension penalty to skip the alignment. Default: 0.00",
                checker_function=lambda x: isinstance(x, float),
                equate=False,
            ),
            # BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
            # number=30, 45, 62 or 80. Default: 62
            _Option(
                ["--bl", "bl"],
                "BLOSUM number matrix is used. Default: 62",
                checker_function=lambda x: x in BLOSUM_MATRICES,
                equate=False,
            ),
            # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
            # Default: BLOSUM62
            _Option(
                ["--jtt", "jtt"],
                "JTT PAM number (Jones et al. 1992) matrix is used. "
                "number>0. Default: BLOSUM62",
                equate=False,
            ),
            # Transmembrane PAM number (Jones et al. 1994) matrix is used.
            # number>0. Default: BLOSUM62
            _Option(
                ["--tm", "tm"],
                "Transmembrane PAM number (Jones et al. 1994) "
                "matrix is used. number>0. Default: BLOSUM62",
                filename=True,  # to ensure spaced inputs are quoted
                equate=False,
            ),
            # Use a user-defined AA scoring matrix. The format of matrixfile is
            # the same to that of BLAST. Ignored when nucleotide sequences are
            # input. Default: BLOSUM62
            _Option(
                ["--aamatrix", "aamatrix"],
                "Use a user-defined AA scoring matrix. Default: BLOSUM62",
                filename=True,  # to ensure spaced inputs are quoted
                equate=False,
            ),
            # Incorporate the AA/nuc composition information into the scoring
            # matrix. Default: off
            _Switch(
                ["--fmodel", "fmodel"],
                "Incorporate the AA/nuc composition information into "
                "the scoring matrix (True) or not (False, default)",
            ),
            # **** Output ****
            # Name length for CLUSTAL and PHYLIP format output
            _Option(
                ["--namelength", "namelength"],
                """Name length in CLUSTAL and PHYLIP output.

                    MAFFT v6.847 (2011) added --namelength for use with
                    the --clustalout option for CLUSTAL output.

                    MAFFT v7.024 (2013) added support for this with the
                    --phylipout option for PHYLIP output (default 10).
                    """,
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Output format: clustal format. Default: off (fasta format)
            _Switch(
                ["--clustalout", "clustalout"],
                "Output format: clustal (True) or fasta (False, default)",
            ),
            # Output format: phylip format.
            # Added in beta with v6.847, fixed in v6.850 (2011)
            _Switch(
                ["--phylipout", "phylipout"],
                "Output format: phylip (True), or fasta (False, default)",
            ),
            # Output order: same as input. Default: on
            _Switch(
                ["--inputorder", "inputorder"],
                "Output order: same as input (True, default) or alignment "
                "based (False)",
            ),
            # Output order: aligned. Default: off (inputorder)
            _Switch(
                ["--reorder", "reorder"],
                "Output order: aligned (True) or in input order (False, default)",
            ),
            # Guide tree is output to the input.tree file. Default: off
            _Switch(
                ["--treeout", "treeout"],
                "Guide tree is output to the input.tree file (True) or "
                "not (False, default)",
            ),
            # Do not report progress. Default: off
            _Switch(
                ["--quiet", "quiet"],
                "Do not report progress (True) or not (False, default).",
            ),
            # **** Input ****
            # Assume the sequences are nucleotide. Default: auto
            _Switch(
                ["--nuc", "nuc"],
                "Assume the sequences are nucleotide (True/False). Default: auto",
            ),
            # Assume the sequences are amino acid. Default: auto
            _Switch(
                ["--amino", "amino"],
                "Assume the sequences are amino acid (True/False). Default: auto",
            ),
            # MAFFT has multiple --seed commands where the unaligned input is
            # aligned to the seed alignment. There can be multiple seeds in the
            # form: "mafft --seed align1 --seed align2 [etc] input"
            # Effectively for n number of seed alignments.
            # TODO - Can we use class _ArgumentList here?
            _Option(
                ["--seed", "seed"],
                "Seed alignments given in alignment_n (fasta format) "
                "are aligned with sequences in input.",
                filename=True,
                equate=False,
            ),
            # The input (must be FASTA format)
            _Argument(["input"], "Input file name", filename=True, is_required=True),
            # mafft-profile takes a second alignment input as an argument:
            # mafft-profile align1 align2
            _Argument(
                ["input1"],
                "Second input file name for the mafft-profile command",
                filename=True,
            ),
        ]
        # self.parameters is assigned before delegating to the base class,
        # which receives the executable name and the caller's option values.
        AbstractCommandline.__init__(self, cmd, **kwargs)
if __name__ == "__main__":
    # When executed as a script, hand off to Biopython's run_doctest helper.
    from Bio import _utils

    _utils.run_doctest()