File size: 9,571 Bytes
7885a28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import operator
from dataclasses import dataclass
import numpy as np
from scipy.special import ndtri
from ._common import ConfidenceInterval


def _validate_int(n, bound, name):
    msg = f'{name} must be an integer not less than {bound}, but got {n!r}'
    try:
        n = operator.index(n)
    except TypeError:
        raise TypeError(msg) from None
    if n < bound:
        raise ValueError(msg)
    return n


@dataclass
class RelativeRiskResult:
    """
    Container for the output of `scipy.stats.contingency.relative_risk`.

    Attributes
    ----------
    relative_risk : float
        The estimated relative risk, i.e.::

            (exposed_cases/exposed_total) / (control_cases/control_total)

    exposed_cases : int
        How many "exposed" individuals in the sample are "cases" (that
        is, show the disease or other event of interest).
    exposed_total : int
        Size of the "exposed" sample.
    control_cases : int
        How many "control" (non-exposed) individuals in the sample are
        "cases".
    control_total : int
        Size of the "control" sample.

    Methods
    -------
    confidence_interval :
        Compute the confidence interval for the relative risk estimate.
    """

    relative_risk: float
    exposed_cases: int
    exposed_total: int
    control_cases: int
    control_total: int

    def confidence_interval(self, confidence_level=0.95):
        """
        Return a confidence interval for the relative risk.

        The interval is obtained with the Katz method ("Method C" of
        [1]_; see also section 3.1.2 of [2]_).

        Parameters
        ----------
        confidence_level : float, optional
            Confidence level of the interval.  Must lie in ``[0, 1]``.
            Default is 0.95.

        Returns
        -------
        ci : ConfidenceInterval instance
            An object whose attributes ``low`` and ``high`` hold the
            end points of the interval.

        Raises
        ------
        ValueError
            If `confidence_level` is not in the interval ``[0, 1]``.

        Notes
        -----
        When `exposed_cases` or `control_cases` is zero, the convention
        of the function ``riskratio`` from the R library epitools is
        followed:

        * both zero: ``low`` and ``high`` are both nan;
        * only `exposed_cases` zero: ``low`` is 0.0 and ``high`` is nan;
        * only `control_cases` zero: ``low`` is nan and ``high`` is inf.

        References
        ----------
        .. [1] D. Katz, J. Baptista, S. P. Azen and M. C. Pike, "Obtaining
               confidence intervals for the risk ratio in cohort studies",
               Biometrics, 34, 469-474 (1978).
        .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
               CRC Press LLC, Boca Raton, FL, USA (1996).

        Examples
        --------
        >>> from scipy.stats.contingency import relative_risk
        >>> result = relative_risk(exposed_cases=10, exposed_total=75,
        ...                        control_cases=12, control_total=225)
        >>> result.relative_risk
        2.5
        >>> result.confidence_interval()
        ConfidenceInterval(low=1.1261564003469628, high=5.549850800541033)
        """
        # Written as a negated chained comparison so that nan is rejected too.
        if not (0 <= confidence_level <= 1):
            raise ValueError('confidence_level must be in the interval [0, 1].')

        # Degenerate tables (a zero case count) have no Katz interval; the
        # end points below follow epitools' riskratio.
        if self.exposed_cases == 0:
            if self.control_cases == 0:
                # Relative risk is nan.
                return ConfidenceInterval(low=np.nan, high=np.nan)
            # Relative risk is 0.
            return ConfidenceInterval(low=0.0, high=np.nan)
        if self.control_cases == 0:
            # Relative risk is inf.
            return ConfidenceInterval(low=np.nan, high=np.inf)

        # Estimated variance of log(relative risk):
        #   1/exposed_cases - 1/exposed_total
        #     + 1/control_cases - 1/control_total
        log_rr_variance = (1/self.exposed_cases - 1/self.exposed_total
                           + 1/self.control_cases - 1/self.control_total)
        std_err = np.sqrt(log_rr_variance)

        # Two-sided normal quantile for the requested confidence level.
        tail = (1 - confidence_level)/2
        quantile = ndtri(1 - tail)

        # The interval is symmetric about log(rr) with this half-width.
        half_width = quantile*std_err
        estimate = self.relative_risk
        return ConfidenceInterval(low=estimate*np.exp(-half_width),
                                  high=estimate*np.exp(half_width))


def relative_risk(exposed_cases, exposed_total, control_cases, control_total):
    """
    Compute the relative risk (also known as the risk ratio).

    The relative risk is a statistic of a 2x2 contingency table ([1]_,
    section 2.2.3; [2]_, section 3.1.2).  This function does not take
    the table itself; the four counts that enter the calculation are
    passed as separate parameters instead.  That removes any ambiguity
    about which row or column of the table holds the "exposed" cases and
    which holds the "control" cases.  The distinction matters because the
    relative risk, unlike for example the odds ratio, is not invariant
    under an interchange of the rows or columns.

    Parameters
    ----------
    exposed_cases : nonnegative int
        How many "exposed" individuals in the sample are "cases" (that
        is, show the disease or other event of interest).
    exposed_total : positive int
        Size of the "exposed" sample.
    control_cases : nonnegative int
        How many "control" (non-exposed) individuals in the sample are
        "cases".
    control_total : positive int
        Size of the "control" sample.

    Returns
    -------
    result : instance of `~scipy.stats._result_classes.RelativeRiskResult`
        The float attribute ``relative_risk`` of the object is::

            rr = (exposed_cases/exposed_total) / (control_cases/control_total)

        The method ``confidence_interval`` of the object computes the
        confidence interval of the relative risk for a given confidence
        level.

    Raises
    ------
    ValueError
        If `exposed_cases` exceeds `exposed_total`, or `control_cases`
        exceeds `control_total`.

    See Also
    --------
    odds_ratio

    Notes
    -----
    If both case counts are zero the relative risk is nan; if only
    `exposed_cases` is zero it is 0.0; if only `control_cases` is zero
    it is inf.

    The function `riskratio` of the R package epitools takes a table
    laid out like this::

                        disease=0   disease=1
        exposed=0 (ref)    n00         n01
        exposed=1          n10         n11

    Given a 2x2 table in that layout, `riskratio` computes the estimate
    of the CI when called with the argument method="wald"; the function
    `riskratio.wald` does the same.

    As an illustration, in a test of the incidence of lung cancer among
    a sample of smokers and nonsmokers, "exposed" would mean "is a
    smoker" and "disease" would mean "has or had lung cancer".

    The equivalent call to ``relative_risk`` for the same data is::

        relative_risk(n11, n10 + n11, n01, n00 + n01)

    .. versionadded:: 1.7.0

    References
    ----------
    .. [1] Alan Agresti, An Introduction to Categorical Data Analysis
           (second edition), Wiley, Hoboken, NJ, USA (2007).
    .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
           CRC Press LLC, Boca Raton, FL, USA (1996).

    Examples
    --------
    >>> from scipy.stats.contingency import relative_risk

    The data below are taken from Example 3.1 of [2]_.  A heart disease
    study produced the following table::

                 High CAT   Low CAT    Total
                 --------   -------    -----
        CHD         27         44        71
        No CHD      95        443       538

        Total      122        487       609

    Here CHD stands for coronary heart disease and CAT for the level of
    circulating catecholamine.  The "exposure" variable is CAT, with
    high CAT as the "exposed" category, so the values to pass to
    ``relative_risk`` are::

        exposed_cases = 27
        exposed_total = 122
        control_cases = 44
        control_total = 487

    >>> result = relative_risk(27, 122, 44, 487)
    >>> result.relative_risk
    2.4495156482861398

    Find the confidence interval for the relative risk.

    >>> result.confidence_interval(confidence_level=0.95)
    ConfidenceInterval(low=1.5836990926700116, high=3.7886786315466354)

    Since 1 lies outside the interval, the data supports the statement
    that high CAT is associated with greater risk of CHD.
    """
    # The ratio itself is simple; the substantial work is done by the
    # `confidence_interval` method of RelativeRiskResult.

    # Counts may be zero, sample sizes may not.
    exposed_cases = _validate_int(exposed_cases, 0, "exposed_cases")
    exposed_total = _validate_int(exposed_total, 1, "exposed_total")
    control_cases = _validate_int(control_cases, 0, "control_cases")
    control_total = _validate_int(control_total, 1, "control_total")

    # A group cannot contain more cases than members.
    if exposed_cases > exposed_total:
        raise ValueError('exposed_cases must not exceed exposed_total.')
    if control_cases > control_total:
        raise ValueError('control_cases must not exceed control_total.')

    if control_cases == 0:
        # Zero denominator: 0/0 is nan, nonzero/0 is inf.
        rr = np.nan if exposed_cases == 0 else np.inf
    elif exposed_cases == 0:
        # 0/nonzero.
        rr = 0.0
    else:
        exposed_risk = exposed_cases / exposed_total
        control_risk = control_cases / control_total
        rr = exposed_risk / control_risk

    return RelativeRiskResult(
        relative_risk=rr,
        exposed_cases=exposed_cases,
        exposed_total=exposed_total,
        control_cases=control_cases,
        control_total=control_total,
    )