Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

File size: 20,572 Bytes

b7731cd

# Copyright 2013 by Michiel de Hoon.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""Support for various forms of sequence motif matrices.

Implementation of frequency (count) matrices, position-weight matrices,
and position-specific scoring matrices.
"""

import math
import numbers

try:
    import numpy as np
except ImportError:
    from Bio import MissingPythonDependencyError

    raise MissingPythonDependencyError(
        "Install NumPy if you want to use Bio.motifs.matrix."
    )

from Bio.Seq import Seq

from . import _pwm


class GenericPositionMatrix(dict):
    """Base class for the support of position matrix operations.

    The matrix is a dictionary that maps each letter of the alphabet to a
    list with one value per motif position. The attribute ``alphabet`` holds
    the letters in row order, and ``length`` holds the number of positions
    (columns) in the matrix.
    """

    def __init__(self, alphabet, values):
        """Initialize the matrix from per-letter rows of values.

        Arguments:
         - alphabet - the letters of the matrix, in row order
         - values   - mapping from each letter in alphabet to a sequence
                      with one value per position

        Raises a ValueError if the rows do not all have the same length.
        """
        self.length = None
        for letter in alphabet:
            row = list(values[letter])
            if self.length is None:
                # The first row defines the number of columns.
                self.length = len(row)
            elif self.length != len(row):
                raise ValueError("data has inconsistent lengths")
            self[letter] = row
        self.alphabet = alphabet

    def __str__(self):
        """Return a string containing nucleotides and counts of the alphabet in the Matrix."""
        header = "   " + " ".join("%6d" % i for i in range(self.length))
        lines = [header]
        for letter in self.alphabet:
            row = " ".join("%6.2f" % value for value in self[letter])
            lines.append("%c: " % letter + row)
        return "\n".join(lines) + "\n"

    def _letters_for_key(self, key):
        """Translate a row key into the letter(s) it selects.

        Returns a tuple (selection, single). If single is True, selection
        is one letter; otherwise it is a list of letters. Accepted keys are
        a slice or a tuple of indices into the alphabet (several letters),
        or an integer index or a one-character string (a single letter).

        Raises a KeyError for any other key.
        """
        alphabet = self.alphabet
        if isinstance(key, slice):
            indices = range(*key.indices(len(alphabet)))
            return [alphabet[i] for i in indices], False
        if isinstance(key, numbers.Integral):
            return alphabet[key], True
        if isinstance(key, tuple):
            return [alphabet[i] for i in key], False
        if isinstance(key, str):
            if len(key) == 1:
                return key, True
            raise KeyError(key)
        raise KeyError("Cannot understand key %s" % (key,))

    def __getitem__(self, key):
        """Return the part of the position matrix selected by key.

        One-dimensional keys select rows (letters):
         - an integer or a one-character string returns the row stored for
           that letter;
         - a slice or a tuple of integers returns a plain dictionary mapping
           each selected letter to its row.

        Two-dimensional keys ``[row_key, column_key]`` additionally select
        positions, where column_key is an integer or a slice:
         - one letter, one position: the single value;
         - one letter, a slice of positions: a tuple of values;
         - several letters, one position: a dictionary letter -> value;
         - several letters, a slice of positions: a dictionary
           letter -> list of values. If every letter of the alphabet is
           selected, an object of the same class as self is returned
           instead of a plain dictionary.

        Raises a KeyError if the key cannot be interpreted.
        """
        if isinstance(key, tuple):
            if len(key) == 2:
                row_key, column_key = key
                letters, single_letter = self._letters_for_key(row_key)
                if isinstance(column_key, slice):
                    columns = range(*column_key.indices(self.length))
                    single_column = False
                elif isinstance(column_key, numbers.Integral):
                    single_column = True
                else:
                    # Wrap in a 1-tuple so that a tuple-valued key is
                    # formatted as a whole instead of being unpacked as
                    # format arguments (which would raise a TypeError).
                    raise KeyError("Cannot understand key %s" % (column_key,))
                if single_letter:
                    row = dict.__getitem__(self, letters)
                    if single_column:
                        return row[column_key]
                    return tuple(row[j] for j in columns)
                if single_column:
                    return {
                        letter: dict.__getitem__(self, letter)[column_key]
                        for letter in letters
                    }
                selected = {}
                for letter in letters:
                    row = dict.__getitem__(self, letter)
                    selected[letter] = [row[j] for j in columns]
                # The alphabet is usually a string, so compare sorted lists
                # on both sides; comparing a list against the string itself
                # can never be true.
                if sorted(letters) == sorted(self.alphabet):
                    return self.__class__(self.alphabet, selected)
                return selected
            elif len(key) == 1:
                key = key[0]
            else:
                raise KeyError("keys should be 1- or 2-dimensional")
        letters, single_letter = self._letters_for_key(key)
        if single_letter:
            return dict.__getitem__(self, letters)
        return {letter: dict.__getitem__(self, letter) for letter in letters}

    @property
    def consensus(self):
        """Return the consensus sequence.

        For each position, the letter with the largest value is chosen;
        ties are resolved in favor of the letter that comes first in the
        alphabet.
        """
        letters = []
        for i in range(self.length):
            maximum = -math.inf
            for letter in self.alphabet:
                count = self[letter][i]
                # Strict comparison: the first letter reaching the maximum wins.
                if count > maximum:
                    maximum = count
                    sequence_letter = letter
            letters.append(sequence_letter)
        return Seq("".join(letters))

    @property
    def anticonsensus(self):
        """Return the anticonsensus sequence.

        For each position, the letter with the smallest value is chosen;
        ties are resolved in favor of the letter that comes first in the
        alphabet.
        """
        letters = []
        for i in range(self.length):
            minimum = math.inf
            for letter in self.alphabet:
                count = self[letter][i]
                # Strict comparison: the first letter reaching the minimum wins.
                if count < minimum:
                    minimum = count
                    sequence_letter = letter
            letters.append(sequence_letter)
        return Seq("".join(letters))

    @property
    def degenerate_consensus(self):
        """Return the degenerate consensus sequence.

        Each position is represented by a single ambiguity code chosen
        from the values in that column. The rules below index the four
        largest values of a column, so a four-letter nucleotide alphabet
        is expected.
        """
        # Following the rules adapted from
        # D. R. Cavener: "Comparison of the consensus sequence flanking
        # translational start sites in Drosophila and vertebrates."
        # Nucleic Acids Research 15(4): 1353-1361. (1987).
        # The same rules are used by TRANSFAC.
        degenerate_nucleotide = {
            "A": "A",
            "C": "C",
            "G": "G",
            "T": "T",
            "AC": "M",
            "AG": "R",
            "AT": "W",
            "CG": "S",
            "CT": "Y",
            "GT": "K",
            "ACG": "V",
            "ACT": "H",
            "AGT": "D",
            "CGT": "B",
            "ACGT": "N",
        }
        symbols = []
        for i in range(self.length):
            # Letters ordered from the largest to the smallest value at
            # position i (the sort is stable, so ties keep alphabet order).
            nucleotides = sorted(
                self, key=lambda nucleotide, i=i: self[nucleotide][i], reverse=True
            )
            counts = [self[c][i] for c in nucleotides]
            # Follow the Cavener rules:
            if counts[0] > sum(counts[1:]) and counts[0] > 2 * counts[1]:
                key = nucleotides[0]
            elif 4 * sum(counts[:2]) > 3 * sum(counts):
                key = "".join(sorted(nucleotides[:2]))
            elif counts[3] == 0:
                key = "".join(sorted(nucleotides[:3]))
            else:
                key = "ACGT"
            if len(key) > 1:
                # The lookup table is written in terms of T; ambiguity codes
                # covering U are the same as those covering T, so map
                # multi-letter RNA keys onto the DNA ones. A lone "U" is
                # left alone so that it is reported as "U".
                key = key.replace("U", "T")
            symbols.append(degenerate_nucleotide.get(key, key))
        return Seq("".join(symbols))

    @property
    def gc_content(self):
        """Compute the fraction GC content.

        This is the sum of all values in the C and G rows divided by the
        sum of all values in the matrix.
        """
        alphabet = self.alphabet
        gc_total = 0.0
        total = 0.0
        for i in range(self.length):
            for letter in alphabet:
                value = self[letter][i]
                if letter in "CG":
                    gc_total += value
                total += value
        return gc_total / total

    def reverse_complement(self):
        """Compute reverse complement.

        Returns a new matrix of the same class in which each row is the
        reversed row of the complementary letter. U is used as the
        complement of A if the alphabet is "ACGU", and T otherwise.
        """
        partner = "U" if self.alphabet == "ACGU" else "T"
        values = {
            "A": self[partner][::-1],
            partner: self["A"][::-1],
            "G": self["C"][::-1],
            "C": self["G"][::-1],
        }
        return self.__class__(self.alphabet, values)


class FrequencyPositionMatrix(GenericPositionMatrix):
    """Position matrix holding letter counts (frequencies) per position."""

    def normalize(self, pseudocounts=None):
        """Return a position-weight matrix computed from the counts matrix.

        The pseudocounts argument controls what is added to each count
        before normalization:
         - None (default): nothing is added;
         - a number: that value is added to every count;
         - a dictionary: the value stored under each letter of the
           alphabet is added to the counts of that letter.

        The counts matrix itself is not modified.
        """
        if pseudocounts is None:
            offsets = dict.fromkeys(self.alphabet, 0.0)
        elif isinstance(pseudocounts, dict):
            offsets = {letter: float(pseudocounts[letter]) for letter in self.alphabet}
        else:
            offsets = {letter: float(pseudocounts) for letter in self.alphabet}
        positions = range(self.length)
        counts = {
            letter: [offsets[letter] + self[letter][i] for i in positions]
            for letter in self.alphabet
        }
        # The division by the column totals happens inside the
        # PositionWeightMatrix initializer.
        return PositionWeightMatrix(self.alphabet, counts)


class PositionWeightMatrix(GenericPositionMatrix):
    """Position matrix whose columns are normalized to sum to one."""

    def __init__(self, alphabet, counts):
        """Initialize the matrix by normalizing each column of counts.

        Every value is divided by the sum of its column, after which the
        rows are stored as tuples.
        """
        GenericPositionMatrix.__init__(self, alphabet, counts)
        for position in range(self.length):
            column_total = sum(self[letter][position] for letter in alphabet)
            for letter in alphabet:
                row = self[letter]
                row[position] = row[position] / column_total
        # Freeze the rows now that normalization is complete.
        for letter in alphabet:
            self[letter] = tuple(self[letter])

    def log_odds(self, background=None):
        """Return the Position-Specific Scoring Matrix.

        The Position-Specific Scoring Matrix (PSSM) holds, for each letter
        and position, the base-2 logarithm of the ratio between the
        probability in this matrix and the background probability of the
        letter. If background is None, a uniform background distribution
        is assumed; otherwise it is a mapping from letters to weights,
        which is copied and rescaled to sum to one.

        Special cases: a zero probability with a positive background
        gives -inf; a positive probability with a non-positive background
        gives +inf; if neither is positive, the score is NaN.
        """
        alphabet = self.alphabet
        if background is None:
            background = dict.fromkeys(self.alphabet, 1.0)
        else:
            # Work on a copy so the caller's mapping is left untouched.
            background = dict(background)
        total = sum(background.values())
        for letter in alphabet:
            background[letter] /= total

        def score(p, b):
            # Log-odds of probability p against background probability b.
            if b > 0:
                return math.log(p / b, 2) if p > 0 else -math.inf
            return math.inf if p > 0 else math.nan

        positions = range(self.length)
        values = {
            letter: [score(self[letter][i], background[letter]) for i in positions]
            for letter in alphabet
        }
        return PositionSpecificScoringMatrix(alphabet, values)


class PositionSpecificScoringMatrix(GenericPositionMatrix):
    """Class for the support of Position Specific Scoring Matrix calculations."""

    def _normalized_background(self, background):
        """Return a background distribution scaled to sum to one.

        If background is None, a uniform distribution over the alphabet is
        used. Otherwise background is copied, so the caller's mapping is
        never modified, and the entries for the letters of the alphabet are
        divided by the sum of all values in the mapping.
        """
        if background is None:
            background = dict.fromkeys(self.alphabet, 1.0)
        else:
            background = dict(background)
        total = sum(background.values())
        for letter in self.alphabet:
            background[letter] /= total
        return background

    def calculate(self, sequence):
        """Return the PWM score for a given sequence for all positions.

        Notes:
         - the sequence can only be a DNA sequence
         - the search is performed only on one strand
         - if the sequence and the motif have the same length, a single
           number is returned
         - otherwise, the result is a one-dimensional numpy array

        Raises a ValueError if the alphabet of the matrix is not made up of
        A, C, G and T, or if the sequence cannot be converted to ASCII
        bytes.
        """
        # TODO - Code itself tolerates ambiguous bases (as NaN).
        if sorted(self.alphabet) != ["A", "C", "G", "T"]:
            raise ValueError(
                "PSSM has wrong alphabet: %s - Use only with DNA motifs" % self.alphabet
            )

        # NOTE: The C code handles mixed case input as this could be large
        # (e.g. contig or chromosome), so requiring it be all upper or lower
        # case would impose an overhead to allocate the extra memory.
        try:
            sequence = bytes(sequence)
        except TypeError:  # str
            try:
                sequence = bytes(sequence, "ASCII")
            except TypeError:
                raise ValueError(
                    "sequence should be a Seq, MutableSeq, string, or bytes-like object"
                ) from None
            except UnicodeEncodeError:
                raise ValueError(
                    "sequence should contain ASCII characters only"
                ) from None
        except Exception:
            raise ValueError(
                "sequence should be a Seq, MutableSeq, string, or bytes-like object"
            ) from None

        n = len(sequence)
        m = self.length
        # Create the numpy arrays here; the C module then does not rely on numpy
        # Use a float32 for the scores array to save space
        scores = np.empty(n - m + 1, np.float32)
        # One row per motif position, with the columns in the order A, C, G, T.
        logodds = np.array(
            [[self[letter][i] for letter in "ACGT"] for i in range(m)], float
        )
        _pwm.calculate(sequence, logodds, scores)

        if len(scores) == 1:
            return scores[0]
        else:
            return scores

    def search(self, sequence, threshold=0.0, both=True, chunksize=10**6):
        """Find hits with PWM score at or above the given threshold.

        A generator function, yielding (position, score) pairs for every
        position in the sequence at which the score is greater than or
        equal to threshold.

        Arguments:
         - sequence  - the sequence to search; it is converted to upper case
         - threshold - minimum score for a position to be reported
         - both      - if True (default), the reverse complement of the
                       matrix is scored as well; such hits are reported
                       with a negative position (position minus the length
                       of the sequence)
         - chunksize - number of start positions scored per chunk

        Within each chunk, hits are yielded in order of increasing position
        along the sequence. Nothing is yielded if the sequence is shorter
        than the motif.
        """
        sequence = sequence.upper()
        seq_len = len(sequence)
        motif_l = self.length
        # Number of positions at which the complete motif fits. Starting
        # chunks only at such positions guarantees that every chunk holds
        # at least one full window, so calculate() is never asked to score
        # a fragment shorter than the motif.
        n_windows = max(seq_len - motif_l + 1, 0)
        if both:
            rc = self.reverse_complement()
        for chunk_start in range(0, n_windows, chunksize):
            # Extend the chunk by motif_l - 1 letters so that windows
            # starting near the end of the chunk are complete.
            subseq = sequence[chunk_start : chunk_start + chunksize + motif_l - 1]
            # calculate() returns a scalar for a single window; promote it
            # to a 1-d array so that the masking below works uniformly.
            pos_scores = np.atleast_1d(self.calculate(subseq))
            pos_ind = pos_scores >= threshold
            pos_positions = np.where(pos_ind)[0] + chunk_start
            pos_scores = pos_scores[pos_ind]
            if both:
                neg_scores = np.atleast_1d(rc.calculate(subseq))
                neg_ind = neg_scores >= threshold
                neg_positions = np.where(neg_ind)[0] + chunk_start
                neg_scores = neg_scores[neg_ind]
            else:
                neg_positions = np.empty((0), dtype=int)
                neg_scores = np.empty((0), dtype=int)
            chunk_positions = np.append(pos_positions, neg_positions - seq_len)
            chunk_scores = np.append(pos_scores, neg_scores)
            # Order the hits by their location along the sequence,
            # irrespective of the strand they were found on.
            order = np.argsort(np.append(pos_positions, neg_positions))
            chunk_positions = chunk_positions[order]
            chunk_scores = chunk_scores[order]
            yield from zip(chunk_positions, chunk_scores)

    @property
    def max(self):
        """Maximal possible score for this motif.

        returns the score computed for the consensus sequence.
        """
        score = 0.0
        letters = self.alphabet
        for position in range(0, self.length):
            score += max(self[letter][position] for letter in letters)
        return score

    @property
    def min(self):
        """Minimal possible score for this motif.

        returns the score computed for the anticonsensus sequence.
        """
        score = 0.0
        letters = self.alphabet
        for position in range(0, self.length):
            score += min(self[letter][position] for letter in letters)
        return score

    @property
    def gc_content(self):
        """Compute the GC-ratio.

        This is not defined for a PSSM; an Exception is always raised.
        """
        raise Exception("Cannot compute the %GC composition of a PSSM")

    def mean(self, background=None):
        """Return expected value of the score of a motif.

        The probability of each letter at each position is recovered as
        the background probability times two to the power of the log-odds
        score. If background is None, a uniform background is assumed.
        Scores that are NaN or -inf do not contribute.
        """
        background = self._normalized_background(background)
        sx = 0.0
        for i in range(self.length):
            for letter in self.alphabet:
                logodds = self[letter, i]
                if math.isnan(logodds):
                    continue
                if math.isinf(logodds) and logodds < 0:
                    continue
                b = background[letter]
                p = b * math.pow(2, logodds)
                sx += p * logodds
        return sx

    def std(self, background=None):
        """Return standard deviation of the score of a motif.

        The variance is accumulated position by position, using the same
        letter probabilities as in mean(). If background is None, a
        uniform background is assumed. Scores that are NaN or -inf do not
        contribute.
        """
        background = self._normalized_background(background)
        variance = 0.0
        for i in range(self.length):
            sx = 0.0
            sxx = 0.0
            for letter in self.alphabet:
                logodds = self[letter, i]
                if math.isnan(logodds):
                    continue
                if math.isinf(logodds) and logodds < 0:
                    continue
                b = background[letter]
                p = b * math.pow(2, logodds)
                sx += p * logodds
                sxx += p * logodds * logodds
            sxx -= sx * sx
            variance += sxx
        variance = max(variance, 0)  # to avoid roundoff problems
        return math.sqrt(variance)

    def dist_pearson(self, other):
        """Return the Pearson-correlation-based distance to another matrix.

        All relative offsets between the two matrices are tried, and the
        one giving the highest Pearson correlation of the matrix values is
        selected. Returns a tuple (distance, offset), in which distance is
        one minus that highest correlation.

        Raises a ValueError if the two matrices have different alphabets.
        """
        if self.alphabet != other.alphabet:
            raise ValueError("Cannot compare motifs with different alphabets")

        # Any correlation is at least -1, so -2 is below every candidate.
        max_p = -2
        for offset in range(-self.length + 1, other.length):
            if offset < 0:
                p = self.dist_pearson_at(other, -offset)
            else:  # offset>=0
                p = other.dist_pearson_at(self, offset)
            if max_p < p:
                max_p = p
                max_o = -offset
        return 1 - max_p, max_o

    def dist_pearson_at(self, other, offset):
        """Return the similarity score based on pearson correlation at the given offset.

        Position pos of other is compared against position pos + offset of
        self. The sums are taken over the overlapping positions only, but
        are divided by the number of values spanned by both matrices
        together.
        """
        letters = self.alphabet
        sx = 0.0  # \sum x
        sy = 0.0  # \sum y
        sxx = 0.0  # \sum x^2
        sxy = 0.0  # \sum x \cdot y
        syy = 0.0  # \sum y^2
        norm = max(self.length, offset + other.length) * len(letters)
        for pos in range(min(self.length - offset, other.length)):
            xi = [self[letter, pos + offset] for letter in letters]
            yi = [other[letter, pos] for letter in letters]
            sx += sum(xi)
            sy += sum(yi)
            sxx += sum(x * x for x in xi)
            sxy += sum(x * y for x, y in zip(xi, yi))
            syy += sum(y * y for y in yi)
        sx /= norm
        sy /= norm
        sxx /= norm
        sxy /= norm
        syy /= norm
        numerator = sxy - sx * sy
        denominator = math.sqrt((sxx - sx * sx) * (syy - sy * sy))
        return numerator / denominator

    def distribution(self, background=None, precision=10**3):
        """Calculate the distribution of the scores at the given precision.

        Returns a ScoreDistribution object created from this matrix and
        the normalized background. If background is None, a uniform
        background is assumed.
        """
        from .thresholds import ScoreDistribution

        background = self._normalized_background(background)
        return ScoreDistribution(precision=precision, pssm=self, background=background)