Spaces:
No application file
No application file
# Copyright 2009 by Cymon J. Cox. All rights reserved. | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Command line wrapper for the multiple alignment program MUSCLE.""" | |
from Bio.Application import _Option, _Switch, AbstractCommandline | |
class MuscleCommandline(AbstractCommandline):
    r"""Command line wrapper for the multiple alignment program MUSCLE.

    http://www.drive5.com/muscle/

    Notes
    -----
    Last checked against version: 3.7, briefly against 3.8

    References
    ----------
    Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high
    accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97.

    Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with
    reduced time and space complexity. BMC Bioinformatics 5(1): 113.

    Examples
    --------
    >>> from Bio.Align.Applications import MuscleCommandline
    >>> muscle_exe = r"C:\Program Files\Alignments\muscle3.8.31_i86win32.exe"
    >>> in_file = r"C:\My Documents\unaligned.fasta"
    >>> out_file = r"C:\My Documents\aligned.fasta"
    >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
    >>> print(muscle_cline)
    "C:\Program Files\Alignments\muscle3.8.31_i86win32.exe" -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta"

    You would typically run the command line with muscle_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.
    """

    def __init__(self, cmd="muscle", **kwargs):
        """Initialize the class.

        Builds the list of MUSCLE options and switches this wrapper knows
        about, then hands over to ``AbstractCommandline.__init__``.

        Arguments:
         - cmd - name of (or path to) the MUSCLE executable.
         - kwargs - initial option values keyed by the last alias of each
           parameter (e.g. ``input=...``, ``out=...`` as in the class
           docstring example); forwarded unchanged to the base class.

        """
        CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"]
        DISTANCE_MEASURES_ITER1 = [
            "kmer6_6",
            "kmer20_3",
            "kmer20_4",
            "kbit20_3",
            "kmer4_6",
        ]
        DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + [
            "pctid_kimura",
            "pctid_log",
        ]
        OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"]
        TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"]
        # The nucleotide arguments for the sequence type parameter in MUSCLE
        # (-seqtype) were updated at some point in MUSCLE version 3.8. Prior
        # to the update 'nucleo' was used for nucleotide. This has been
        # updated to 'rna' and 'dna'. 'nucleo' is kept for backwards
        # compatibility with older MUSCLE versions.
        SEQUENCE_TYPES = ["protein", "rna", "dna", "nucleo", "auto"]
        WEIGHTING_SCHEMES = [
            "none",
            "clustalw",
            "henikoff",
            "henikoffpb",
            "gsc",
            "threeway",
        ]

        def is_int(value):
            """Accept integer option values."""
            return isinstance(value, int)

        def is_number(value):
            """Accept float or integer option values, but not booleans.

            Whole numbers such as ``maxhours=1`` are valid for options that
            take a floating point value; bool is excluded because it is an
            int subclass and was never accepted for these options.
            """
            return isinstance(value, (int, float)) and not isinstance(value, bool)

        def one_of(choices):
            """Return a checker accepting only members of ``choices``."""
            return lambda value: value in choices

        # Notes on the comments below: "default" values are those quoted from
        # the MUSCLE manual's option table; a default of "[1]" refers to
        # footnote [1] of that table.
        self.parameters = [
            # Can't use "in" as the final alias as this
            # is a reserved word in python:
            _Option(
                ["-in", "in", "input"], "Input filename", filename=True, equate=False
            ),
            _Option(["-out", "out"], "Output filename", filename=True, equate=False),
            _Switch(
                ["-diags", "diags"], "Find diagonals (faster for similar sequences)"
            ),
            _Switch(["-profile", "profile"], "Perform a profile alignment"),
            _Option(
                ["-in1", "in1"],
                "First input filename for profile alignment",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-in2", "in2"],
                "Second input filename for a profile alignment",
                filename=True,
                equate=False,
            ),
            # anchorspacing: integer, default 32.
            # Minimum spacing between anchor columns.
            _Option(
                ["-anchorspacing", "anchorspacing"],
                "Minimum spacing between anchor columns",
                checker_function=is_int,
                equate=False,
            ),
            # center: floating point, default [1].
            # Center parameter, should be negative.
            _Option(
                ["-center", "center"],
                "Center parameter - should be negative",
                checker_function=is_number,
                equate=False,
            ),
            # cluster1 / cluster2: upgma, upgmb or neighborjoining,
            # default upgmb. cluster1 is used in iteration 1 and 2,
            # cluster2 in later iterations.
            _Option(
                ["-cluster1", "cluster1"],
                "Clustering method used in iteration 1",
                checker_function=one_of(CLUSTERING_ALGORITHMS),
                equate=False,
            ),
            _Option(
                ["-cluster2", "cluster2"],
                "Clustering method used in iteration 2",
                checker_function=one_of(CLUSTERING_ALGORITHMS),
                equate=False,
            ),
            # diaglength: integer, default 24. Minimum length of diagonal.
            # Rendered as "-diaglength N" like every other valued option
            # (it used to be the only one emitted as "-diaglength=N").
            _Option(
                ["-diaglength", "diaglength"],
                "Minimum length of diagonal",
                checker_function=is_int,
                equate=False,
            ),
            # diagmargin: integer, default 5.
            # Discard this many positions at ends of diagonal.
            _Option(
                ["-diagmargin", "diagmargin"],
                "Discard this many positions at ends of diagonal",
                checker_function=is_int,
                equate=False,
            ),
            # distance1: kmer6_6, kmer20_3, kmer20_4, kbit20_3 or kmer4_6;
            # default Kmer6_6 (amino) or Kmer4_6 (nucleo).
            # Distance measure for iteration 1.
            _Option(
                ["-distance1", "distance1"],
                "Distance measure for iteration 1",
                checker_function=one_of(DISTANCE_MEASURES_ITER1),
                equate=False,
            ),
            # distance2: kmer6_6, kmer20_3, kmer20_4, kbit20_3, pctid_kimura
            # or pctid_log; default pctid_kimura.
            # Distance measure for iterations 2, 3 ...
            _Option(
                ["-distance2", "distance2"],
                "Distance measure for iteration 2",
                checker_function=one_of(DISTANCE_MEASURES_ITER2),
                equate=False,
            ),
            # gapextend: floating point, default [1]. The gap extend score.
            _Option(
                ["-gapextend", "gapextend"],
                "Gap extension penalty",
                checker_function=is_number,
                equate=False,
            ),
            # gapopen: floating point, default [1].
            # The gap open score, must be negative.
            _Option(
                ["-gapopen", "gapopen"],
                "Gap open score - negative number",
                checker_function=is_number,
                equate=False,
            ),
            # hydro: integer, default 5. Window size for determining
            # whether a region is hydrophobic.
            _Option(
                ["-hydro", "hydro"],
                "Window size for hydrophobic region",
                checker_function=is_int,
                equate=False,
            ),
            # hydrofactor: floating point, default 1.2. Multiplier for gap
            # open/close penalties in hydrophobic regions.
            _Option(
                ["-hydrofactor", "hydrofactor"],
                "Multiplier for gap penalties in hydrophobic regions",
                checker_function=is_number,
                equate=False,
            ),
            # log: file name, default none.
            # Log file name (delete existing file).
            _Option(["-log", "log"], "Log file name", filename=True, equate=False),
            # loga: file name, default none.
            # Log file name (append to existing file).
            _Option(
                ["-loga", "loga"],
                "Log file name (append to existing file)",
                filename=True,
                equate=False,
            ),
            # matrix: file name, default none. File name for substitution
            # matrix in NCBI or WU-BLAST format. If you specify your own
            # matrix, you should also specify:
            #     -gapopen <g>
            #     -gapextend <e>
            #     -center 0.0
            _Option(
                ["-matrix", "matrix"],
                "path to NCBI or WU-BLAST format protein substitution "
                "matrix - also set -gapopen, -gapextend and -center",
                filename=True,
                equate=False,
            ),
            # diagbreak: integer, default 1. Maximum distance between two
            # diagonals that allows them to merge into one diagonal.
            _Option(
                ["-diagbreak", "diagbreak"],
                "Maximum distance between two diagonals that allows "
                "them to merge into one diagonal",
                checker_function=is_int,
                equate=False,
            ),
            _Option(
                ["-maxdiagbreak", "maxdiagbreak"],  # deprecated 3.8
                "Deprecated in v3.8, use -diagbreak instead.",
                checker_function=is_int,
                equate=False,
            ),
            # maxhours: floating point, default none. Maximum time to run
            # in hours. The actual time may exceed the requested limit by a
            # few minutes. Decimals are allowed, so 1.5 means one hour and
            # 30 minutes.
            _Option(
                ["-maxhours", "maxhours"],
                "Maximum time to run in hours",
                checker_function=is_number,
                equate=False,
            ),
            # maxiters: integer 1, 2 ..., default 16.
            # Maximum number of iterations.
            _Option(
                ["-maxiters", "maxiters"],
                "Maximum number of iterations",
                checker_function=is_int,
                equate=False,
            ),
            # maxtrees: integer, default 1.
            # Maximum number of new trees to build in iteration 2.
            _Option(
                ["-maxtrees", "maxtrees"],
                "Maximum number of trees to build in iteration 2",
                checker_function=is_int,
                equate=False,
            ),
            # minbestcolscore: floating point, default [1].
            # Minimum score a column must have to be an anchor.
            _Option(
                ["-minbestcolscore", "minbestcolscore"],
                "Minimum score a column must have to be an anchor",
                checker_function=is_number,
                equate=False,
            ),
            # minsmoothscore: floating point, default [1].
            # Minimum smoothed score a column must have to be an anchor.
            _Option(
                ["-minsmoothscore", "minsmoothscore"],
                "Minimum smoothed score a column must have to be an anchor",
                checker_function=is_number,
                equate=False,
            ),
            # objscore: sp, ps, dp, xp, spf or spm; default spm.
            # Objective score used by tree dependent refinement.
            #   sp  = sum-of-pairs score.
            #   spf = presumably the dimer approximation of sp (the option
            #         table this was copied from is incomplete here).
            #   spm = sp for < 100 seqs, otherwise spf.
            #   dp  = dynamic programming score.
            #   ps  = average profile-sequence score.
            #   xp  = cross profile score.
            _Option(
                ["-objscore", "objscore"],
                "Objective score used by tree dependent refinement",
                checker_function=one_of(OBJECTIVE_SCORES),
                equate=False,
            ),
            # refinewindow: integer, default 200.
            # Length of window for -refinew.
            _Option(
                ["-refinewindow", "refinewindow"],
                "Length of window for -refinew",
                checker_function=is_int,
                equate=False,
            ),
            # root1 / root2: pseudo, midlongestspan or minavgleafdist;
            # default pseudo. Method used to root tree; root1 is used in
            # iteration 1 and 2, root2 in later iterations.
            _Option(
                ["-root1", "root1"],
                "Method used to root tree in iteration 1",
                checker_function=one_of(TREE_ROOT_METHODS),
                equate=False,
            ),
            _Option(
                ["-root2", "root2"],
                "Method used to root tree in iteration 2",
                checker_function=one_of(TREE_ROOT_METHODS),
                equate=False,
            ),
            # scorefile: file name, default none. File name where to write
            # a score file. This contains one line for each column in the
            # alignment. The line contains the letters in the column
            # followed by the average BLOSUM62 score over pairs of letters
            # in the column.
            _Option(
                ["-scorefile", "scorefile"],
                "Score file name, contains one line for each column"
                " in the alignment with average BLOSUM62 score",
                filename=True,
                equate=False,
            ),
            # seqtype: default auto. Sequence type, one of
            #   protein
            #   dna    (MUSCLE version > 3.8)
            #   rna    (MUSCLE version > 3.8)
            #   auto
            #   nucleo (only valid for MUSCLE versions < 3.8)
            _Option(
                ["-seqtype", "seqtype"],
                "Sequence type",
                checker_function=one_of(SEQUENCE_TYPES),
                equate=False,
            ),
            # smoothscoreceil: floating point, default [1].
            # Maximum value of column score for smoothing purposes.
            _Option(
                ["-smoothscoreceil", "smoothscoreceil"],
                "Maximum value of column score for smoothing",
                checker_function=is_number,
                equate=False,
            ),
            # smoothwindow: integer, default 7.
            # Window used for anchor column smoothing.
            _Option(
                ["-smoothwindow", "smoothwindow"],
                "Window used for anchor column smoothing",
                checker_function=is_int,
                equate=False,
            ),
            # spscore: file name.
            # Compute SP objective score of multiple alignment.
            _Option(
                ["-spscore", "spscore"],
                "Compute SP objective score of multiple alignment",
                filename=True,
                equate=False,
            ),
            # sueff: floating point value between 0 and 1, default 0.1.
            # Constant used in UPGMB clustering. Determines the relative
            # fraction of average linkage (SUEFF) vs. nearest neighbor
            # linkage (1 - SUEFF).
            _Option(
                ["-sueff", "sueff"],
                "Constant used in UPGMB clustering",
                checker_function=is_number,
                equate=False,
            ),
            # tree1 / tree2: file name, default none. Save tree produced in
            # first or second iteration to given file in Newick
            # (Phylip-compatible) format. Flagged as file names so that
            # paths containing spaces are quoted like the other file options.
            _Option(
                ["-tree1", "tree1"],
                "Save Newick tree from iteration 1",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-tree2", "tree2"],
                "Save Newick tree from iteration 2",
                filename=True,
                equate=False,
            ),
            # usetree: file name, default none. Use given tree as guide
            # tree. Must be in Newick (Phylip-compatible) format.
            _Option(
                ["-usetree", "usetree"],
                "Use given Newick tree as guide tree",
                filename=True,
                equate=False,
            ),
            # weight1 / weight2: none, henikoff, henikoffpb, gsc, clustalw
            # or threeway; default clustalw. Sequence weighting scheme.
            # weight1 is used in iterations 1 and 2, weight2 is used for
            # tree-dependent refinement.
            #   none       = all sequences have equal weight.
            #   henikoff   = Henikoff & Henikoff weighting scheme.
            #   henikoffpb = Modified Henikoff scheme as used in PSI-BLAST.
            #   clustalw   = CLUSTALW method.
            #   threeway   = Gotoh three-way method.
            _Option(
                ["-weight1", "weight1"],
                "Weighting scheme used in iteration 1",
                checker_function=one_of(WEIGHTING_SCHEMES),
                equate=False,
            ),
            _Option(
                ["-weight2", "weight2"],
                "Weighting scheme used in iteration 2",
                checker_function=one_of(WEIGHTING_SCHEMES),
                equate=False,
            ),
            # ################### FORMATS ####################################
            # Multiple formats can be specified on the command line
            # If -msf appears it will be used regardless of other formats
            # specified. If -clw appears (and not -msf), clustalw format will
            # be used regardless of other formats specified. If both -clw and
            # -clwstrict are specified -clwstrict will be used regardless of
            # other formats specified. If -fasta is specified and not -msf,
            # -clw, or clwstrict, fasta will be used. If -fasta and -html are
            # specified -fasta will be used. Only if -html is specified alone
            # will html be used. I kid ye not.
            #
            # clw (default no): write output in CLUSTALW format
            # (default is FASTA).
            _Switch(
                ["-clw", "clw"],
                "Write output in CLUSTALW format (with a MUSCLE header)",
            ),
            # clwstrict (default no): write output in CLUSTALW format with
            # the "CLUSTAL W (1.81)" header rather than the MUSCLE version.
            # This is useful when a post-processing step is picky about the
            # file header.
            _Switch(
                ["-clwstrict", "clwstrict"],
                "Write output in CLUSTALW format with version 1.81 header",
            ),
            # fasta (default yes): write output in FASTA format.
            # Alternatives include clw, clwstrict, msf and html.
            _Switch(["-fasta", "fasta"], "Write output in FASTA format"),
            # html (default no): write output in HTML format
            # (default is FASTA).
            _Switch(["-html", "html"], "Write output in HTML format"),
            # msf (default no): write output in MSF format
            # (default is FASTA).
            _Switch(["-msf", "msf"], "Write output in MSF format"),
            # Phylip interleaved - undocumented as of 3.7
            _Switch(["-phyi", "phyi"], "Write output in PHYLIP interleaved format"),
            # Phylip sequential - undocumented as of 3.7
            _Switch(["-phys", "phys"], "Write output in PHYLIP sequential format"),
            # ################# Additional specified output files #########
            _Option(
                ["-phyiout", "phyiout"],
                "Write PHYLIP interleaved output to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-physout", "physout"],
                "Write PHYLIP sequential format to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-htmlout", "htmlout"],
                "Write HTML output to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-clwout", "clwout"],
                "Write CLUSTALW output (with MUSCLE header) to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-clwstrictout", "clwstrictout"],
                "Write CLUSTALW output (with version 1.81 header) to "
                "specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-msfout", "msfout"],
                "Write MSF format output to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-fastaout", "fastaout"],
                "Write FASTA format output to specified filename",
                filename=True,
                equate=False,
            ),
            # ############# END FORMATS ###################################
            # anchors (default yes): use anchor optimization in tree
            # dependent refinement iterations.
            _Switch(
                ["-anchors", "anchors"],
                "Use anchor optimisation in tree dependent refinement iterations",
            ),
            # noanchors (default no): disable anchor optimization.
            # Default is anchors.
            _Switch(
                ["-noanchors", "noanchors"],
                "Do not use anchor optimisation in tree dependent "
                "refinement iterations",
            ),
            # brenner (default no): use Steven Brenner's method for
            # computing the root alignment.
            _Switch(
                ["-brenner", "brenner"], "Use Steve Brenner's root alignment method"
            ),
            # cluster (default no): perform fast clustering of input
            # sequences. Use the tree1 option to save the tree.
            _Switch(
                ["-cluster", "cluster"],
                "Perform fast clustering of input sequences, "
                "use -tree1 to save tree",
            ),
            # dimer (default no): use dimer approximation for the SP score
            # (faster, less accurate).
            _Switch(
                ["-dimer", "dimer"],
                "Use faster (slightly less accurate) dimer approximation "
                "for the SP score",
            ),
            # group (default yes): group similar sequences together in the
            # output. This is the default. See also stable.
            _Switch(["-group", "group"], "Group similar sequences in output"),
            # ############# log-expectation profile score ####################
            # One of either -le, -sp, or -sv
            #
            # According to the doc, spn is default and the only option for
            # nucleotides: this doesn't appear to be true. -le, -sp, and -sv
            # can be used and produce numerically different logs
            # (what is going on?)
            #
            # spn fails on proteins
            #
            # le (default maybe): use log-expectation profile score
            # (VTML240). Alternatives are to use sp or sv. This is the
            # default for amino acid sequences.
            _Switch(["-le", "le"], "Use log-expectation profile score (VTML240)"),
            # sv (default no): use sum-of-pairs profile score (VTML240).
            # Default is le.
            _Switch(["-sv", "sv"], "Use sum-of-pairs profile score (VTML240)"),
            # sp (default no): use sum-of-pairs protein profile score
            # (PAM200). Default is le.
            _Switch(["-sp", "sp"], "Use sum-of-pairs protein profile score (PAM200)"),
            # spn (default maybe): use sum-of-pairs nucleotide profile score
            # (BLASTZ parameters). This is the only option for nucleotides,
            # and is therefore the default.
            _Switch(["-spn", "spn"], "Use sum-of-pairs nucleotide profile score"),
            # ########## END log-expectation profile score ###################
            # quiet (default no): do not display progress messages.
            _Switch(["-quiet", "quiet"], "Do not display progress messages"),
            # refine (default no): input file is already aligned, skip first
            # two iterations and begin tree dependent refinement.
            _Switch(["-refine", "refine"], "Only do tree dependent refinement"),
            # refinew (default no): refine an alignment by dividing it into
            # non-overlapping windows and re-aligning each window. Typically
            # used for whole-genome nucleotide alignments.
            _Switch(
                ["-refinew", "refinew"],
                "Only do tree dependent refinement using sliding window approach",
            ),
            # core (default yes in muscle, no in muscled):
            # do not catch exceptions.
            _Switch(["-core", "core"], "Do not catch exceptions"),
            # nocore (default no in muscle, yes in muscled):
            # catch exceptions and give an error message if possible.
            _Switch(["-nocore", "nocore"], "Catch exceptions"),
            # stable (default no): preserve input order of sequences in
            # output file. Default is to group sequences by similarity
            # (group).
            _Switch(
                ["-stable", "stable"],
                "Do not group similar sequences in output (not supported in v3.8)",
            ),
            # The following MUSCLE options are intentionally not wrapped:
            #
            # termgaps4 (default yes): use 4-way test for treatment of
            #     terminal gaps. (Cannot be disabled in this version).
            # termgapsfull (default no): terminal gaps penalized with full
            #     penalty. [1] Not fully supported in this version.
            # termgapshalf (default yes): terminal gaps penalized with half
            #     penalty. [1] Not fully supported in this version.
            # termgapshalflonger (default no): terminal gaps penalized with
            #     half penalty if gap relative to longer sequence, otherwise
            #     with full penalty. [1] Not fully supported in this version.
            #
            # verbose (default no): write parameter settings and progress
            # messages to log file.
            _Switch(["-verbose", "verbose"], "Write parameter settings and progress"),
            # version (default no): write version string to stdout and exit.
            _Switch(["-version", "version"], "Write version string to stdout and exit"),
        ]
        super().__init__(cmd, **kwargs)
if __name__ == "__main__":
    # Script entry point: hand over to Biopython's doctest helper
    # (presumably runs the doctests found in this module's docstrings).
    from Bio import _utils

    _utils.run_doctest()