Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

File size: 10,382 Bytes

b7731cd

# Copyright 2012 by Christian Brueffer.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""Command line wrapper for the motif finding program XXmotif."""


import os
from Bio.Application import AbstractCommandline, _Option, _Switch, _Argument


class XXmotifCommandline(AbstractCommandline):
    """Command line wrapper for XXmotif.

    http://xxmotif.genzentrum.lmu.de/

    Notes
    -----
    Last checked against version: 1.3

    The two positional arguments (``outdir`` and ``seqfile``) are required.
    All other parameters are optional and are validated by the checker
    functions declared alongside them in ``__init__``.

    References
    ----------
    Luehr S, Hartmann H, and Söding J. The XXmotif web server for eXhaustive,
    weight matriX-based motif discovery in nucleotide sequences,
    Nucleic Acids Res. 40: W104-W109 (2012).

    Hartmann H, Guthoehrlein EW, Siebert M., Luehr S, and Söding J. P-value
    based regulatory motif discovery using positional weight matrices,
    Genome Res. 23: 181–194 (2013)

    Examples
    --------
    >>> from Bio.motifs.applications import XXmotifCommandline
    >>> out_dir = "results"
    >>> in_file = "sequences.fasta"
    >>> xxmotif_cline = XXmotifCommandline(outdir=out_dir, seqfile=in_file, revcomp=True)
    >>> print(xxmotif_cline)
    XXmotif results sequences.fasta --revcomp

    You would typically run the command line with xxmotif_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    """

    def __init__(self, cmd="XXmotif", **kwargs):
        """Initialize the class.

        Parameters
        ----------
        cmd : str
            Name or path of the XXmotif executable (default ``"XXmotif"``).
        **kwargs
            Initial values for any of the parameters declared below, passed
            through to ``AbstractCommandline.__init__``.
        """
        # order of parameters is the same as in XXmotif --help
        _valid_alphabet = set("ACGTNX")

        self.parameters = [
            _Argument(
                ["outdir", "OUTDIR"],
                "output directory for all results",
                filename=True,
                is_required=True,
                # XXmotif currently does not accept spaces in the outdir name
                checker_function=lambda x: " " not in x,
            ),
            _Argument(
                ["seqfile", "SEQFILE"],
                "file name with sequences from positive set in FASTA format",
                filename=True,
                is_required=True,
                # XXmotif currently only accepts a pure filename
                # (i.e. a name without any directory component)
                checker_function=lambda x: os.path.split(x)[0] == "",
            ),
            # Options
            _Option(
                ["--negSet", "negSet", "NEGSET", "negset"],
                "sequence set which has to be used as a reference set",
                filename=True,
                equate=False,
            ),
            _Switch(
                ["--zoops", "ZOOPS", "zoops"],
                "use zero-or-one occurrence per sequence model (DEFAULT)",
            ),
            _Switch(
                ["--mops", "MOPS", "mops"], "use multiple occurrence per sequence model"
            ),
            _Switch(
                ["--oops", "OOPS", "oops"], "use one occurrence per sequence model"
            ),
            _Switch(
                ["--revcomp", "REVCOMP", "revcomp"],
                "search in reverse complement of sequences as well (DEFAULT: NO)",
            ),
            _Option(
                [
                    "--background-model-order",
                    "background-model-order",
                    "BACKGROUND-MODEL-ORDER",
                    "background_model_order",
                ],
                "order of background distribution (DEFAULT: 2, 8(--negset) )",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            _Option(
                ["--pseudo", "PSEUDO", "pseudo"],
                "percentage of pseudocounts used (DEFAULT: 10)",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            _Option(
                ["-g", "--gaps", "GAPS", "gaps"],
                "maximum number of gaps used for start seeds [0-3] (DEFAULT: 0)",
                # Accept the integers 0..3 inclusive.  The previous check,
                # ``x in [0 - 3]``, evaluated to ``x in [-3]`` and therefore
                # rejected every documented value.
                checker_function=lambda x: isinstance(x, int) and 0 <= x <= 3,
                equate=False,
            ),
            _Option(
                ["--type", "TYPE", "type"],
                "defines what kind of start seeds are used (DEFAULT: ALL) "
                "possible types: ALL, FIVEMERS, PALINDROME, TANDEM, NOPALINDROME, NOTANDEM",
                checker_function=lambda x: x
                in [
                    "ALL",
                    "all",
                    "FIVEMERS",
                    "fivemers",
                    "PALINDROME",
                    "palindrome",
                    "TANDEM",
                    "tandem",
                    "NOPALINDROME",
                    "nopalindrome",
                    "NOTANDEM",
                    "notandem",
                ],
                equate=False,
            ),
            _Option(
                [
                    "--merge-motif-threshold",
                    "merge-motif-threshold",
                    "MERGE-MOTIF-THRESHOLD",
                    "merge_motif_threshold",
                ],
                "defines the similarity threshold for merging motifs (DEFAULT: HIGH) "
                "possible modes: LOW, MEDIUM, HIGH",
                checker_function=lambda x: x
                in ["LOW", "low", "MEDIUM", "medium", "HIGH", "high"],
                equate=False,
            ),
            _Switch(
                [
                    "--no-pwm-length-optimization",
                    "no-pwm-length-optimization",
                    "NO-PWM-LENGTH-OPTIMIZATION",
                    "no_pwm_length_optimization",
                ],
                "do not optimize length during iterations (runtime advantages)",
            ),
            _Option(
                [
                    "--max-match-positions",
                    "max-match-positions",
                    "MAX-MATCH-POSITIONS",
                    "max_match_positions",
                ],
                "max number of positions per motif (DEFAULT: 17, higher values will lead to very long runtimes)",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            _Switch(
                ["--batch", "BATCH", "batch"],
                "suppress progress bars (reduce output size for batch jobs)",
            ),
            _Option(
                ["--maxPosSetSize", "maxPosSetSize", "MAXPOSSETSIZE", "maxpossetsize"],
                "maximum number of sequences from the positive set used [DEFAULT: all]",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # does not make sense in biopython
            # _Switch(["--help", "help", "HELP"],
            #         "print this help page"),
            _Option(
                ["--trackedMotif", "trackedMotif", "TRACKEDMOTIF", "trackedmotif"],
                "inspect extensions and refinement of a given seed (DEFAULT: not used)",
                # NOTE(review): passes if at least one character is in
                # _valid_alphabet; a stricter all-characters check may have
                # been intended -- confirm against XXmotif before changing.
                checker_function=lambda x: any((c in _valid_alphabet) for c in x),
                equate=False,
            ),
            # Using conservation information
            _Option(
                ["--format", "FORMAT", "format"],
                "defines what kind of format the input sequences have (DEFAULT: FASTA)",
                checker_function=lambda x: x in ["FASTA", "fasta", "MFASTA", "mfasta"],
                equate=False,
            ),
            _Option(
                [
                    "--maxMultipleSequences",
                    "maxMultipleSequences",
                    "MAXMULTIPLESEQUENCES",
                    "maxmultiplesequences",
                ],
                "maximum number of sequences used in an alignment [DEFAULT: all]",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Using localization information
            _Switch(
                ["--localization", "LOCALIZATION", "localization"],
                "use localization information to calculate combined P-values "
                "(sequences should have all the same length)",
            ),
            _Option(
                ["--downstream", "DOWNSTREAM", "downstream"],
                "number of residues in positive set downstream of anchor point (DEFAULT: 0)",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # Start with self defined motif
            _Option(
                ["-m", "--startMotif", "startMotif", "STARTMOTIF", "startmotif"],
                "Start motif (IUPAC characters)",
                # NOTE(review): same lenient any-character check as for
                # --trackedMotif; the alphabet here is only ACGTNX although
                # the help text mentions IUPAC characters -- confirm intent.
                checker_function=lambda x: any((c in _valid_alphabet) for c in x),
                equate=False,
            ),
            _Option(
                ["-p", "--profileFile", "profileFile", "PROFILEFILE", "profilefile"],
                "profile file",
                filename=True,
                equate=False,
            ),
            _Option(
                ["--startRegion", "startRegion", "STARTREGION", "startregion"],
                "expected start position for motif occurrences relative to anchor point (--localization)",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            _Option(
                ["--endRegion", "endRegion", "ENDREGION", "endregion"],
                "expected end position for motif occurrences relative to anchor point (--localization)",
                checker_function=lambda x: isinstance(x, int),
                equate=False,
            ),
            # XXmotif wrapper options
            _Switch(
                ["--XXmasker", "masker"],
                "mask the input sequences for homology, repeats and low complexity regions",
            ),
            _Switch(
                ["--XXmasker-pos", "maskerpos"],
                "mask only the positive set for homology, repeats and low complexity regions",
            ),
            _Switch(
                ["--no-graphics", "nographics"], "run XXmotif without graphical output"
            ),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)


if __name__ == "__main__":
    # Script entry point only: hand off to Biopython's doctest helper.
    import Bio._utils as _bio_utils

    _bio_utils.run_doctest()