Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /motifs /applications /_xxmotif.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

10.4 kB

	# Copyright 2012 by Christian Brueffer. All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.

	"""Command line wrapper for the motif finding program XXmotif."""


	import os
	from Bio.Application import AbstractCommandline, _Option, _Switch, _Argument


	class XXmotifCommandline(AbstractCommandline):
	    """Command line wrapper for the motif finding program XXmotif.

	    http://xxmotif.genzentrum.lmu.de/

	    The two positional arguments (``outdir`` and ``seqfile``) are required;
	    every other parameter is optional and is validated by the checker
	    function attached to its declaration in ``__init__``.

	    Notes
	    -----
	    Last checked against version: 1.3

	    References
	    ----------
	    Luehr S, Hartmann H, and Söding J. The XXmotif web server for eXhaustive,
	    weight matriX-based motif discovery in nucleotide sequences,
	    Nucleic Acids Res. 40: W104-W109 (2012).

	    Hartmann H, Guthoehrlein EW, Siebert M., Luehr S, and Söding J. P-value
	    based regulatory motif discovery using positional weight matrices,
	    Genome Res. 23: 181–194 (2013)

	    Examples
	    --------
	    >>> from Bio.motifs.applications import XXmotifCommandline
	    >>> out_dir = "results"
	    >>> in_file = "sequences.fasta"
	    >>> xxmotif_cline = XXmotifCommandline(outdir=out_dir, seqfile=in_file, revcomp=True)
	    >>> print(xxmotif_cline)
	    XXmotif results sequences.fasta --revcomp

	    You would typically run the command line with xxmotif_cline() or via
	    the Python subprocess module, as described in the Biopython tutorial.

	    """

	    def __init__(self, cmd="XXmotif", **kwargs):
	        """Initialize the class.

	        Parameters
	        ----------
	        cmd : str
	            Name (or path) of the XXmotif executable; defaults to "XXmotif".
	        **kwargs
	            Forwarded unchanged to ``AbstractCommandline.__init__``; as in
	            the class example, these are parameter values keyed by the
	            names declared below (e.g. ``outdir=...``, ``revcomp=True``).
	        """
	        # order of parameters is the same as in XXmotif --help
	        _valid_alphabet = set("ACGTNX")

	        def _is_int(value):
	            # Shared checker for all integer-valued options.
	            return isinstance(value, int)

	        def _has_valid_symbol(value):
	            # Accepts the motif when at least one character belongs to
	            # _valid_alphabet.
	            # NOTE(review): `all` may have been intended here, but that
	            # would reject motifs using other IUPAC codes - confirm before
	            # tightening.
	            return any((c in _valid_alphabet) for c in value)

	        self.parameters = [
	            _Argument(
	                ["outdir", "OUTDIR"],
	                "output directory for all results",
	                filename=True,
	                is_required=True,
	                # XXmotif currently does not accept spaces in the outdir name
	                checker_function=lambda x: " " not in x,
	            ),
	            _Argument(
	                ["seqfile", "SEQFILE"],
	                "file name with sequences from positive set in FASTA format",
	                filename=True,
	                is_required=True,
	                # XXmotif currently only accepts a pure filename
	                checker_function=lambda x: os.path.split(x)[0] == "",
	            ),
	            # Options
	            _Option(
	                ["--negSet", "negSet", "NEGSET", "negset"],
	                "sequence set which has to be used as a reference set",
	                filename=True,
	                equate=False,
	            ),
	            _Switch(
	                ["--zoops", "ZOOPS", "zoops"],
	                "use zero-or-one occurrence per sequence model (DEFAULT)",
	            ),
	            _Switch(
	                ["--mops", "MOPS", "mops"], "use multiple occurrence per sequence model"
	            ),
	            _Switch(
	                ["--oops", "OOPS", "oops"], "use one occurrence per sequence model"
	            ),
	            _Switch(
	                ["--revcomp", "REVCOMP", "revcomp"],
	                "search in reverse complement of sequences as well (DEFAULT: NO)",
	            ),
	            _Option(
	                [
	                    "--background-model-order",
	                    "background-model-order",
	                    "BACKGROUND-MODEL-ORDER",
	                    "background_model_order",
	                ],
	                "order of background distribution (DEFAULT: 2, 8(--negset) )",
	                checker_function=_is_int,
	                equate=False,
	            ),
	            _Option(
	                ["--pseudo", "PSEUDO", "pseudo"],
	                "percentage of pseudocounts used (DEFAULT: 10)",
	                checker_function=_is_int,
	                equate=False,
	            ),
	            _Option(
	                ["-g", "--gaps", "GAPS", "gaps"],
	                "maximum number of gaps used for start seeds [0-3] (DEFAULT: 0)",
	                # The documented range is 0..3 inclusive.  The previous
	                # check, `x in [0 - 3]`, evaluated to `x in [-3]` and so
	                # rejected every valid value.
	                checker_function=lambda x: x in (0, 1, 2, 3),
	                equate=False,
	            ),
	            _Option(
	                ["--type", "TYPE", "type"],
	                "defines what kind of start seeds are used (DEFAULT: ALL)"
	                "possible types: ALL, FIVEMERS, PALINDROME, TANDEM, NOPALINDROME, NOTANDEM",
	                checker_function=lambda x: x
	                in [
	                    "ALL",
	                    "all",
	                    "FIVEMERS",
	                    "fivemers",
	                    "PALINDROME",
	                    "palindrome",
	                    "TANDEM",
	                    "tandem",
	                    "NOPALINDROME",
	                    "nopalindrome",
	                    "NOTANDEM",
	                    "notandem",
	                ],
	                equate=False,
	            ),
	            _Option(
	                [
	                    "--merge-motif-threshold",
	                    "merge-motif-threshold",
	                    "MERGE-MOTIF-THRESHOLD",
	                    "merge_motif_threshold",
	                ],
	                "defines the similarity threshold for merging motifs (DEFAULT: HIGH)"
	                "possible modes: LOW, MEDIUM, HIGH",
	                checker_function=lambda x: x
	                in ["LOW", "low", "MEDIUM", "medium", "HIGH", "high"],
	                equate=False,
	            ),
	            _Switch(
	                [
	                    "--no-pwm-length-optimization",
	                    "no-pwm-length-optimization",
	                    "NO-PWM-LENGTH-OPTIMIZATION",
	                    "no_pwm_length_optimization",
	                ],
	                "do not optimize length during iterations (runtime advantages)",
	            ),
	            _Option(
	                [
	                    "--max-match-positions",
	                    "max-match-positions",
	                    "MAX-MATCH-POSITIONS",
	                    "max_match_positions",
	                ],
	                "max number of positions per motif (DEFAULT: 17, higher values will lead to very long runtimes)",
	                checker_function=_is_int,
	                equate=False,
	            ),
	            _Switch(
	                ["--batch", "BATCH", "batch"],
	                "suppress progress bars (reduce output size for batch jobs)",
	            ),
	            _Option(
	                ["--maxPosSetSize", "maxPosSetSize", "MAXPOSSETSIZE", "maxpossetsize"],
	                "maximum number of sequences from the positive set used [DEFAULT: all]",
	                checker_function=_is_int,
	                equate=False,
	            ),
	            # does not make sense in biopython
	            # _Switch(["--help", "help", "HELP"],
	            # "print this help page"),
	            _Option(
	                ["--trackedMotif", "trackedMotif", "TRACKEDMOTIF", "trackedmotif"],
	                "inspect extensions and refinement of a given seed (DEFAULT: not used)",
	                checker_function=_has_valid_symbol,
	                equate=False,
	            ),
	            # Using conservation information
	            _Option(
	                ["--format", "FORMAT", "format"],
	                "defines what kind of format the input sequences have (DEFAULT: FASTA)",
	                checker_function=lambda x: x in ["FASTA", "fasta", "MFASTA", "mfasta"],
	                equate=False,
	            ),
	            _Option(
	                [
	                    "--maxMultipleSequences",
	                    "maxMultipleSequences",
	                    "MAXMULTIPLESEQUENCES",
	                    "maxmultiplesequences",
	                ],
	                "maximum number of sequences used in an alignment [DEFAULT: all]",
	                checker_function=_is_int,
	                equate=False,
	            ),
	            # Using localization information
	            _Switch(
	                ["--localization", "LOCALIZATION", "localization"],
	                "use localization information to calculate combined P-values"
	                "(sequences should have all the same length)",
	            ),
	            _Option(
	                ["--downstream", "DOWNSTREAM", "downstream"],
	                "number of residues in positive set downstream of anchor point (DEFAULT: 0)",
	                checker_function=_is_int,
	                equate=False,
	            ),
	            # Start with self defined motif
	            _Option(
	                ["-m", "--startMotif", "startMotif", "STARTMOTIF", "startmotif"],
	                "Start motif (IUPAC characters)",
	                checker_function=_has_valid_symbol,
	                equate=False,
	            ),
	            _Option(
	                ["-p", "--profileFile", "profileFile", "PROFILEFILE", "profilefile"],
	                "profile file",
	                filename=True,
	                equate=False,
	            ),
	            _Option(
	                ["--startRegion", "startRegion", "STARTREGION", "startregion"],
	                "expected start position for motif occurrences relative to anchor point (--localization)",
	                checker_function=_is_int,
	                equate=False,
	            ),
	            _Option(
	                ["--endRegion", "endRegion", "ENDREGION", "endregion"],
	                "expected end position for motif occurrences relative to anchor point (--localization)",
	                checker_function=_is_int,
	                equate=False,
	            ),
	            # XXmotif wrapper options
	            _Switch(
	                ["--XXmasker", "masker"],
	                "mask the input sequences for homology, repeats and low complexity regions",
	            ),
	            _Switch(
	                ["--XXmasker-pos", "maskerpos"],
	                "mask only the positive set for homology, repeats and low complexity regions",
	            ),
	            _Switch(
	                ["--no-graphics", "nographics"], "run XXmotif without graphical output"
	            ),
	        ]
	        AbstractCommandline.__init__(self, cmd, **kwargs)


	if __name__ == "__main__":
	    # When executed as a script, run this module's doctests through
	    # Biopython's private test helper.
	    from Bio import _utils

	    _utils.run_doctest()