File size: 4,530 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Copyright 2008 by Norbert Dojer.  All rights reserved.
# Adapted by Bartek Wilczynski.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Approximate calculation of appropriate thresholds for motif finding."""


class ScoreDistribution:
    """Class representing approximate score distribution for a given motif.

    Utilizes a dynamic programming approach to calculate the distribution of
    scores with a predefined precision. Provides a number of methods for calculating
    thresholds for motif occurrences.
    """

    def __init__(self, motif=None, precision=10**3, pssm=None, background=None):
        """Initialize the class."""
        if pssm is None:
            self.min_score = min(0.0, motif.min_score())
            self.interval = max(0.0, motif.max_score()) - self.min_score
            self.n_points = precision * motif.length
            self.ic = motif.ic()
        else:
            self.min_score = min(0.0, pssm.min)
            self.interval = max(0.0, pssm.max) - self.min_score
            self.n_points = precision * pssm.length
            self.ic = pssm.mean(background)
        self.step = self.interval / (self.n_points - 1)
        self.mo_density = [0.0] * self.n_points
        self.mo_density[-self._index_diff(self.min_score)] = 1.0
        self.bg_density = [0.0] * self.n_points
        self.bg_density[-self._index_diff(self.min_score)] = 1.0
        if pssm is None:
            for lo, mo in zip(motif.log_odds(), motif.pwm()):
                self.modify(lo, mo, motif.background)
        else:
            for position in range(pssm.length):
                mo_new = [0.0] * self.n_points
                bg_new = [0.0] * self.n_points
                lo = pssm[:, position]
                for letter, score in lo.items():
                    bg = background[letter]
                    mo = pow(2, pssm[letter, position]) * bg
                    d = self._index_diff(score)
                    for i in range(self.n_points):
                        mo_new[self._add(i, d)] += self.mo_density[i] * mo
                        bg_new[self._add(i, d)] += self.bg_density[i] * bg
                self.mo_density = mo_new
                self.bg_density = bg_new

    def _index_diff(self, x, y=0.0):
        return int((x - y + 0.5 * self.step) // self.step)

    def _add(self, i, j):
        return max(0, min(self.n_points - 1, i + j))

    def modify(self, scores, mo_probs, bg_probs):
        """Modify motifs and background density."""
        mo_new = [0.0] * self.n_points
        bg_new = [0.0] * self.n_points
        for k, v in scores.items():
            d = self._index_diff(v)
            for i in range(self.n_points):
                mo_new[self._add(i, d)] += self.mo_density[i] * mo_probs[k]
                bg_new[self._add(i, d)] += self.bg_density[i] * bg_probs[k]
        self.mo_density = mo_new
        self.bg_density = bg_new

    def threshold_fpr(self, fpr):
        """Approximate the log-odds threshold which makes the type I error (false positive rate)."""
        i = self.n_points
        prob = 0.0
        while prob < fpr:
            i -= 1
            prob += self.bg_density[i]
        return self.min_score + i * self.step

    def threshold_fnr(self, fnr):
        """Approximate the log-odds threshold which makes the type II error (false negative rate)."""
        i = -1
        prob = 0.0
        while prob < fnr:
            i += 1
            prob += self.mo_density[i]
        return self.min_score + i * self.step

    def threshold_balanced(self, rate_proportion=1.0, return_rate=False):
        """Approximate log-odds threshold making FNR equal to FPR times rate_proportion."""
        i = self.n_points
        fpr = 0.0
        fnr = 1.0
        while fpr * rate_proportion < fnr:
            i -= 1
            fpr += self.bg_density[i]
            fnr -= self.mo_density[i]
        if return_rate:
            return self.min_score + i * self.step, fpr
        else:
            return self.min_score + i * self.step

    def threshold_patser(self):
        """Threshold selection mimicking the behaviour of patser (Hertz, Stormo 1999) software.

        It selects such a threshold that the log(fpr)=-ic(M)
        note: the actual patser software uses natural logarithms instead of log_2, so the numbers
        are not directly comparable.
        """
        return self.threshold_fpr(fpr=2**-self.ic)