|
from collections import namedtuple |
|
from dataclasses import dataclass |
|
from math import comb |
|
import numpy as np |
|
import warnings |
|
from itertools import combinations |
|
import scipy.stats |
|
from scipy.optimize import shgo |
|
from . import distributions |
|
from ._common import ConfidenceInterval |
|
from ._continuous_distns import norm |
|
from scipy.special import gamma, kv, gammaln |
|
from scipy.fft import ifft |
|
from ._stats_pythran import _a_ij_Aij_Dij2 |
|
from ._stats_pythran import ( |
|
_concordant_pairs as _P, _discordant_pairs as _Q |
|
) |
|
from ._axis_nan_policy import _axis_nan_policy_factory |
|
from scipy.stats import _stats_py |
|
|
|
__all__ = ['epps_singleton_2samp', 'cramervonmises', 'somersd', |
|
'barnard_exact', 'boschloo_exact', 'cramervonmises_2samp', |
|
'tukey_hsd', 'poisson_means_test'] |
|
|
|
Epps_Singleton_2sampResult = namedtuple('Epps_Singleton_2sampResult', |
|
('statistic', 'pvalue')) |
|
|
|
|
|
@_axis_nan_policy_factory(Epps_Singleton_2sampResult, n_samples=2, too_small=4) |
|
def epps_singleton_2samp(x, y, t=(0.4, 0.8)): |
|
"""Compute the Epps-Singleton (ES) test statistic. |
|
|
|
Test the null hypothesis that two samples have the same underlying |
|
probability distribution. |
|
|
|
Parameters |
|
---------- |
|
x, y : array-like |
|
The two samples of observations to be tested. Input must not have more |
|
than one dimension. Samples can have different lengths, but both |
|
must have at least five observations. |
|
t : array-like, optional |
|
The points (t1, ..., tn) where the empirical characteristic function is |
|
        to be evaluated. These should be distinct, positive numbers. The default
|
value (0.4, 0.8) is proposed in [1]_. Input must not have more than |
|
one dimension. |
|
|
|
Returns |
|
------- |
|
statistic : float |
|
The test statistic. |
|
pvalue : float |
|
The associated p-value based on the asymptotic chi2-distribution. |
|
|
|
See Also |
|
-------- |
|
ks_2samp, anderson_ksamp |
|
|
|
Notes |
|
----- |
|
Testing whether two samples are generated by the same underlying |
|
distribution is a classical question in statistics. A widely used test is |
|
the Kolmogorov-Smirnov (KS) test which relies on the empirical |
|
distribution function. Epps and Singleton introduce a test based on the |
|
empirical characteristic function in [1]_. |
|
|
|
    One advantage of the ES test compared to the KS test is that it does
|
not assume a continuous distribution. In [1]_, the authors conclude |
|
that the test also has a higher power than the KS test in many |
|
examples. They recommend the use of the ES test for discrete samples as |
|
well as continuous samples with at least 25 observations each, whereas |
|
`anderson_ksamp` is recommended for smaller sample sizes in the |
|
continuous case. |
|
|
|
The p-value is computed from the asymptotic distribution of the test |
|
statistic which follows a `chi2` distribution. If the sample size of both |
|
`x` and `y` is below 25, the small sample correction proposed in [1]_ is |
|
applied to the test statistic. |
|
|
|
The default values of `t` are determined in [1]_ by considering |
|
various distributions and finding good values that lead to a high power |
|
of the test in general. Table III in [1]_ gives the optimal values for |
|
the distributions tested in that study. The values of `t` are scaled by |
|
the semi-interquartile range in the implementation, see [1]_. |
|
|
|
References |
|
---------- |
|
.. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample |
|
problem using the empirical characteristic function", Journal of |
|
Statistical Computation and Simulation 26, p. 177--203, 1986. |
|
|
|
.. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions |
|
- the Epps-Singleton two-sample test using the empirical characteristic |
|
function", The Stata Journal 9(3), p. 454--465, 2009. |
|
|
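    Examples
    --------
    A minimal usage sketch (the exact statistic and p-value depend on the
    randomly drawn samples):

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng(1234)
    >>> x = rng.normal(size=30)
    >>> y = rng.normal(loc=0.5, size=30)
    >>> res = stats.epps_singleton_2samp(x, y)
    >>> statistic, pvalue = res.statistic, res.pvalue
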
|
""" |
|
|
|
t = np.asarray(t) |
|
|
|
nx, ny = len(x), len(y) |
|
if (nx < 5) or (ny < 5): |
|
raise ValueError('x and y should have at least 5 elements, but len(x) ' |
|
f'= {nx} and len(y) = {ny}.') |
|
if not np.isfinite(x).all(): |
|
raise ValueError('x must not contain nonfinite values.') |
|
if not np.isfinite(y).all(): |
|
raise ValueError('y must not contain nonfinite values.') |
|
n = nx + ny |
|
|
|
|
|
if t.ndim > 1: |
|
raise ValueError(f't must be 1d, but t.ndim equals {t.ndim}.') |
|
if np.less_equal(t, 0).any(): |
|
raise ValueError('t must contain positive elements only.') |
|
|
|
|
|
|
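    # rescale t with the semi-interquartile range as proposed in [1];
    # the import is local to avoid a circular import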
|
from scipy.stats import iqr |
|
sigma = iqr(np.hstack((x, y))) / 2 |
|
ts = np.reshape(t, (-1, 1)) / sigma |
|
|
|
|
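    # each row of gx/gy holds the cosine and sine parts of the empirical
    # characteristic function at one observation; shape (nx, 2*len(t))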
|
gx = np.vstack((np.cos(ts*x), np.sin(ts*x))).T |
|
gy = np.vstack((np.cos(ts*y), np.sin(ts*y))).T |
|
cov_x = np.cov(gx.T, bias=True) |
|
cov_y = np.cov(gy.T, bias=True) |
|
est_cov = (n/nx)*cov_x + (n/ny)*cov_y |
|
est_cov_inv = np.linalg.pinv(est_cov) |
|
r = np.linalg.matrix_rank(est_cov_inv) |
|
if r < 2*len(t): |
|
warnings.warn('Estimated covariance matrix does not have full rank. ' |
|
'This indicates a bad choice of the input t and the ' |
|
'test might not be consistent.', |
|
stacklevel=2) |
|
|
|
|
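    # the statistic is a quadratic form in the difference of the mean
    # transforms; asymptotically chi-squared with df equal to the rank r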
|
g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0) |
|
w = n*np.dot(g_diff.T, np.dot(est_cov_inv, g_diff)) |
|
|
|
|
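    # apply the small sample correction proposed in [1]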
|
if (max(nx, ny) < 25): |
|
corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7))) |
|
w = corr * w |
|
|
|
chi2 = _stats_py._SimpleChi2(r) |
|
p = _stats_py._get_pvalue(w, chi2, alternative='greater', symmetric=False, xp=np) |
|
|
|
return Epps_Singleton_2sampResult(w, p) |
|
|
|
|
|
def poisson_means_test(k1, n1, k2, n2, *, diff=0, alternative='two-sided'): |
|
r""" |
|
    Performs the Poisson means test, also known as the "E-test".
|
|
|
This is a test of the null hypothesis that the difference between means of |
|
two Poisson distributions is `diff`. The samples are provided as the |
|
number of events `k1` and `k2` observed within measurement intervals |
|
(e.g. of time, space, number of observations) of sizes `n1` and `n2`. |
|
|
|
Parameters |
|
---------- |
|
k1 : int |
|
Number of events observed from distribution 1. |
|
    n1 : float
|
Size of sample from distribution 1. |
|
k2 : int |
|
Number of events observed from distribution 2. |
|
n2 : float |
|
Size of sample from distribution 2. |
|
diff : float, default=0 |
|
The hypothesized difference in means between the distributions |
|
underlying the samples. |
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Defines the alternative hypothesis. |
|
The following options are available (default is 'two-sided'): |
|
|
|
* 'two-sided': the difference between distribution means is not |
|
equal to `diff` |
|
* 'less': the difference between distribution means is less than |
|
`diff` |
|
* 'greater': the difference between distribution means is greater |
|
than `diff` |
|
|
|
Returns |
|
------- |
|
statistic : float |
|
The test statistic (see [1]_ equation 3.3). |
|
pvalue : float |
|
The probability of achieving such an extreme value of the test |
|
statistic under the null hypothesis. |
|
|
|
Notes |
|
----- |
|
|
|
Let: |
|
|
|
.. math:: X_1 \sim \mbox{Poisson}(\mathtt{n1}\lambda_1) |
|
|
|
be a random variable independent of |
|
|
|
.. math:: X_2 \sim \mbox{Poisson}(\mathtt{n2}\lambda_2) |
|
|
|
and let ``k1`` and ``k2`` be the observed values of :math:`X_1` |
|
and :math:`X_2`, respectively. Then `poisson_means_test` uses the number |
|
of observed events ``k1`` and ``k2`` from samples of size ``n1`` and |
|
``n2``, respectively, to test the null hypothesis that |
|
|
|
.. math:: |
|
H_0: \lambda_1 - \lambda_2 = \mathtt{diff} |
|
|
|
A benefit of the E-test is that it has good power for small sample sizes, |
|
which can reduce sampling costs [1]_. It has been evaluated and determined |
|
to be more powerful than the comparable C-test, sometimes referred to as |
|
the Poisson exact test. |
|
|
|
References |
|
---------- |
|
.. [1] Krishnamoorthy, K., & Thomson, J. (2004). A more powerful test for |
|
comparing two Poisson means. Journal of Statistical Planning and |
|
Inference, 119(1), 23-35. |
|
|
|
.. [2] Przyborowski, J., & Wilenski, H. (1940). Homogeneity of results in |
|
testing samples from Poisson series: With an application to testing |
|
clover seed for dodder. Biometrika, 31(3/4), 313-323. |
|
|
|
Examples |
|
-------- |
|
|
|
Suppose that a gardener wishes to test the number of dodder (weed) seeds |
|
in a sack of clover seeds that they buy from a seed company. It has |
|
previously been established that the number of dodder seeds in clover |
|
follows the Poisson distribution. |
|
|
|
A 100 gram sample is drawn from the sack before being shipped to the |
|
gardener. The sample is analyzed, and it is found to contain no dodder |
|
seeds; that is, `k1` is 0. However, upon arrival, the gardener draws |
|
another 100 gram sample from the sack. This time, three dodder seeds are |
|
found in the sample; that is, `k2` is 3. The gardener would like to |
|
know if the difference is significant and not due to chance. The |
|
null hypothesis is that the difference between the two samples is merely |
|
due to chance, or that :math:`\lambda_1 - \lambda_2 = \mathtt{diff}` |
|
where :math:`\mathtt{diff} = 0`. The alternative hypothesis is that the |
|
difference is not due to chance, or :math:`\lambda_1 - \lambda_2 \ne 0`. |
|
The gardener selects a significance level of 5% to reject the null |
|
hypothesis in favor of the alternative [2]_. |
|
|
|
>>> import scipy.stats as stats |
|
>>> res = stats.poisson_means_test(0, 100, 3, 100) |
|
>>> res.statistic, res.pvalue |
|
(-1.7320508075688772, 0.08837900929018157) |
|
|
|
The p-value is .088, indicating a near 9% chance of observing a value of |
|
the test statistic under the null hypothesis. This exceeds 5%, so the |
|
gardener does not reject the null hypothesis as the difference cannot be |
|
regarded as significant at this level. |
|
""" |
|
|
|
_poisson_means_test_iv(k1, n1, k2, n2, diff, alternative) |
|
|
|
|
|
lmbd_hat2 = ((k1 + k2) / (n1 + n2) - diff * n1 / (n1 + n2)) |
|
|
|
|
|
|
|
|
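    # if the estimated rate of the second distribution under the null is
    # not positive, the null hypothesis cannot be rejected (see [1])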
|
if lmbd_hat2 <= 0: |
|
return _stats_py.SignificanceResult(0, 1) |
|
|
|
|
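    # variance estimate of the difference of the sample rates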
|
var = k1 / (n1 ** 2) + k2 / (n2 ** 2) |
|
|
|
|
|
|
|
|
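    # the observed value of the test statistic (equation 3.3 in [1])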
|
t_k1k2 = (k1 / n1 - k2 / n2 - diff) / np.sqrt(var) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
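    # expected counts under the null hypothesis, where
    # lambda_1 = lambda_2 + diff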
|
nlmbd_hat1 = n1 * (lmbd_hat2 + diff) |
|
nlmbd_hat2 = n2 * lmbd_hat2 |
|
|
|
|
|
|
|
|
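    # bounds of the truncated sample space: the region outside carries a
    # negligible amount of probability mass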
|
x1_lb, x1_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat1) |
|
x2_lb, x2_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat2) |
|
|
|
|
|
|
|
|
|
x1 = np.arange(x1_lb, x1_ub + 1) |
|
x2 = np.arange(x2_lb, x2_ub + 1)[:, None] |
|
|
|
|
|
|
|
|
|
|
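    # null probabilities of every point of the truncated sample space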
|
prob_x1 = distributions.poisson.pmf(x1, nlmbd_hat1) |
|
prob_x2 = distributions.poisson.pmf(x2, nlmbd_hat2) |
|
|
|
|
|
|
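    # the test statistic evaluated at every point of the sample space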
|
lmbd_x1 = x1 / n1 |
|
lmbd_x2 = x2 / n2 |
|
lmbds_diff = lmbd_x1 - lmbd_x2 - diff |
|
var_x1x2 = lmbd_x1 / n1 + lmbd_x2 / n2 |
|
|
|
|
|
|
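    # var_x1x2 is 0 when x1 == x2 == 0; ignore the resulting 0/0 warning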
|
with np.errstate(invalid='ignore', divide='ignore'): |
|
t_x1x2 = lmbds_diff / np.sqrt(var_x1x2) |
|
|
|
|
|
|
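    # select the part of the sample space at least as extreme as observed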
|
if alternative == 'two-sided': |
|
indicator = np.abs(t_x1x2) >= np.abs(t_k1k2) |
|
elif alternative == 'less': |
|
indicator = t_x1x2 <= t_k1k2 |
|
else: |
|
indicator = t_x1x2 >= t_k1k2 |
|
|
|
|
|
|
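    # the p-value is the total null probability of the extreme region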
|
pvalue = np.sum((prob_x1 * prob_x2)[indicator]) |
|
return _stats_py.SignificanceResult(t_k1k2, pvalue) |
|
|
|
|
|
def _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative): |
|
|
|
if k1 != int(k1) or k2 != int(k2): |
|
raise TypeError('`k1` and `k2` must be integers.') |
|
|
|
count_err = '`k1` and `k2` must be greater than or equal to 0.' |
|
if k1 < 0 or k2 < 0: |
|
raise ValueError(count_err) |
|
|
|
if n1 <= 0 or n2 <= 0: |
|
raise ValueError('`n1` and `n2` must be greater than 0.') |
|
|
|
if diff < 0: |
|
        raise ValueError('`diff` must be greater than or equal to 0.')
|
|
|
alternatives = {'two-sided', 'less', 'greater'} |
|
if alternative.lower() not in alternatives: |
|
raise ValueError(f"Alternative must be one of '{alternatives}'.") |
|
|
|
|
|
class CramerVonMisesResult: |
|
def __init__(self, statistic, pvalue): |
|
self.statistic = statistic |
|
self.pvalue = pvalue |
|
|
|
def __repr__(self): |
|
return (f"{self.__class__.__name__}(statistic={self.statistic}, " |
|
f"pvalue={self.pvalue})") |
|
|
|
|
|
def _psi1_mod(x): |
|
""" |
|
psi1 is defined in equation 1.10 in Csörgő, S. and Faraway, J. (1996). |
|
This implements a modified version by excluding the term V(x) / 12 |
|
(here: _cdf_cvm_inf(x) / 12) to avoid evaluating _cdf_cvm_inf(x) |
|
twice in _cdf_cvm. |
|
|
|
Implementation based on MAPLE code of Julian Faraway and R code of the |
|
function pCvM in the package goftest (v1.1.1), permission granted |
|
by Adrian Baddeley. Main difference in the implementation: the code |
|
here keeps adding terms of the series until the terms are small enough. |
|
""" |
|
|
|
def _ed2(y): |
|
z = y**2 / 4 |
|
b = kv(1/4, z) + kv(3/4, z) |
|
return np.exp(-z) * (y/2)**(3/2) * b / np.sqrt(np.pi) |
|
|
|
def _ed3(y): |
|
z = y**2 / 4 |
|
c = np.exp(-z) / np.sqrt(np.pi) |
|
return c * (y/2)**(5/2) * (2*kv(1/4, z) + 3*kv(3/4, z) - kv(5/4, z)) |
|
|
|
def _Ak(k, x): |
|
m = 2*k + 1 |
|
sx = 2 * np.sqrt(x) |
|
y1 = x**(3/4) |
|
y2 = x**(5/4) |
|
|
|
e1 = m * gamma(k + 1/2) * _ed2((4 * k + 3)/sx) / (9 * y1) |
|
e2 = gamma(k + 1/2) * _ed3((4 * k + 1) / sx) / (72 * y2) |
|
e3 = 2 * (m + 2) * gamma(k + 3/2) * _ed3((4 * k + 5) / sx) / (12 * y2) |
|
e4 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 1) / sx) / (144 * y1) |
|
e5 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 5) / sx) / (144 * y1) |
|
|
|
return e1 + e2 + e3 + e4 + e5 |
|
|
|
x = np.asarray(x) |
|
tot = np.zeros_like(x, dtype='float') |
|
cond = np.ones_like(x, dtype='bool') |
|
k = 0 |
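    # sum the series until the terms are below the tolerance for every
    # entry; `cond` tracks the entries that have not yet converged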
|
while np.any(cond): |
|
z = -_Ak(k, x[cond]) / (np.pi * gamma(k + 1)) |
|
tot[cond] = tot[cond] + z |
|
cond[cond] = np.abs(z) >= 1e-7 |
|
k += 1 |
|
|
|
return tot |
|
|
|
|
|
def _cdf_cvm_inf(x): |
|
""" |
|
Calculate the cdf of the Cramér-von Mises statistic (infinite sample size). |
|
|
|
See equation 1.2 in Csörgő, S. and Faraway, J. (1996). |
|
|
|
Implementation based on MAPLE code of Julian Faraway and R code of the |
|
function pCvM in the package goftest (v1.1.1), permission granted |
|
by Adrian Baddeley. Main difference in the implementation: the code |
|
here keeps adding terms of the series until the terms are small enough. |
|
|
|
The function is not expected to be accurate for large values of x, say |
|
x > 4, when the cdf is very close to 1. |
|
""" |
|
x = np.asarray(x) |
|
|
|
def term(x, k): |
|
|
|
u = np.exp(gammaln(k + 0.5) - gammaln(k+1)) / (np.pi**1.5 * np.sqrt(x)) |
|
y = 4*k + 1 |
|
q = y**2 / (16*x) |
|
b = kv(0.25, q) |
|
return u * np.sqrt(y) * np.exp(-q) * b |
|
|
|
tot = np.zeros_like(x, dtype='float') |
|
cond = np.ones_like(x, dtype='bool') |
|
k = 0 |
|
while np.any(cond): |
|
z = term(x[cond], k) |
|
tot[cond] = tot[cond] + z |
|
cond[cond] = np.abs(z) >= 1e-7 |
|
k += 1 |
|
|
|
return tot |
|
|
|
|
|
def _cdf_cvm(x, n=None): |
|
""" |
|
Calculate the cdf of the Cramér-von Mises statistic for a finite sample |
|
    size n. If n is None, use the asymptotic cdf (n=inf).
|
|
|
See equation 1.8 in Csörgő, S. and Faraway, J. (1996) for finite samples, |
|
1.2 for the asymptotic cdf. |
|
|
|
The function is not expected to be accurate for large values of x, say |
|
x > 2, when the cdf is very close to 1 and it might return values > 1 |
|
in that case, e.g. _cdf_cvm(2.0, 12) = 1.0000027556716846. Moreover, it |
|
is not accurate for small values of n, especially close to the bounds of |
|
the distribution's domain, [1/(12*n), n/3], where the value jumps to 0 |
|
and 1, respectively. These are limitations of the approximation by Csörgő |
|
and Faraway (1996) implemented in this function. |
|
""" |
|
x = np.asarray(x) |
|
if n is None: |
|
y = _cdf_cvm_inf(x) |
|
else: |
|
|
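        # the domain of the statistic is [1/(12*n), n/3]; apply the
        # finite-sample approximation of equation 1.8 on the interior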
|
y = np.zeros_like(x, dtype='float') |
|
sup = (1./(12*n) < x) & (x < n/3.) |
|
|
|
|
|
y[sup] = _cdf_cvm_inf(x[sup]) * (1 + 1./(12*n)) + _psi1_mod(x[sup]) / n |
|
y[x >= n/3] = 1 |
|
|
|
if y.ndim == 0: |
|
return y[()] |
|
return y |
|
|
|
|
|
def _cvm_result_to_tuple(res): |
|
return res.statistic, res.pvalue |
|
|
|
|
|
@_axis_nan_policy_factory(CramerVonMisesResult, n_samples=1, too_small=1, |
|
result_to_tuple=_cvm_result_to_tuple) |
|
def cramervonmises(rvs, cdf, args=()): |
|
"""Perform the one-sample Cramér-von Mises test for goodness of fit. |
|
|
|
This performs a test of the goodness of fit of a cumulative distribution |
|
function (cdf) :math:`F` compared to the empirical distribution function |
|
:math:`F_n` of observed random variates :math:`X_1, ..., X_n` that are |
|
assumed to be independent and identically distributed ([1]_). |
|
The null hypothesis is that the :math:`X_i` have cumulative distribution |
|
:math:`F`. |
|
|
|
Parameters |
|
---------- |
|
rvs : array_like |
|
A 1-D array of observed values of the random variables :math:`X_i`. |
|
The sample must contain at least two observations. |
|
cdf : str or callable |
|
The cumulative distribution function :math:`F` to test the |
|
observations against. If a string, it should be the name of a |
|
distribution in `scipy.stats`. If a callable, that callable is used |
|
to calculate the cdf: ``cdf(x, *args) -> float``. |
|
args : tuple, optional |
|
Distribution parameters. These are assumed to be known; see Notes. |
|
|
|
Returns |
|
------- |
|
res : object with attributes |
|
statistic : float |
|
Cramér-von Mises statistic. |
|
pvalue : float |
|
The p-value. |
|
|
|
See Also |
|
-------- |
|
kstest, cramervonmises_2samp |
|
|
|
Notes |
|
----- |
|
.. versionadded:: 1.6.0 |
|
|
|
The p-value relies on the approximation given by equation 1.8 in [2]_. |
|
It is important to keep in mind that the p-value is only accurate if |
|
one tests a simple hypothesis, i.e. the parameters of the reference |
|
distribution are known. If the parameters are estimated from the data |
|
(composite hypothesis), the computed p-value is not reliable. |
|
|
|
References |
|
---------- |
|
.. [1] Cramér-von Mises criterion, Wikipedia, |
|
https://en.wikipedia.org/wiki/Cram%C3%A9r%E2%80%93von_Mises_criterion |
|
.. [2] Csörgő, S. and Faraway, J. (1996). The Exact and Asymptotic |
|
Distribution of Cramér-von Mises Statistics. Journal of the |
|
Royal Statistical Society, pp. 221-234. |
|
|
|
Examples |
|
-------- |
|
|
|
Suppose we wish to test whether data generated by ``scipy.stats.norm.rvs`` |
|
were, in fact, drawn from the standard normal distribution. We choose a |
|
significance level of ``alpha=0.05``. |
|
|
|
>>> import numpy as np |
|
>>> from scipy import stats |
|
>>> rng = np.random.default_rng(165417232101553420507139617764912913465) |
|
>>> x = stats.norm.rvs(size=500, random_state=rng) |
|
>>> res = stats.cramervonmises(x, 'norm') |
|
>>> res.statistic, res.pvalue |
|
(0.1072085112565724, 0.5508482238203407) |
|
|
|
The p-value exceeds our chosen significance level, so we do not |
|
reject the null hypothesis that the observed sample is drawn from the |
|
standard normal distribution. |
|
|
|
    Now suppose we wish to check whether the same sample, shifted by 2.1, is
|
consistent with being drawn from a normal distribution with a mean of 2. |
|
|
|
>>> y = x + 2.1 |
|
>>> res = stats.cramervonmises(y, 'norm', args=(2,)) |
|
>>> res.statistic, res.pvalue |
|
(0.8364446265294695, 0.00596286797008283) |
|
|
|
Here we have used the `args` keyword to specify the mean (``loc``) |
|
of the normal distribution to test the data against. This is equivalent |
|
to the following, in which we create a frozen normal distribution with |
|
    mean 2 and then pass its ``cdf`` method as an argument.
|
|
|
>>> frozen_dist = stats.norm(loc=2) |
|
>>> res = stats.cramervonmises(y, frozen_dist.cdf) |
|
>>> res.statistic, res.pvalue |
|
(0.8364446265294695, 0.00596286797008283) |
|
|
|
In either case, we would reject the null hypothesis that the observed |
|
sample is drawn from a normal distribution with a mean of 2 (and default |
|
variance of 1) because the p-value is less than our chosen |
|
significance level. |
|
|
|
""" |
|
if isinstance(cdf, str): |
|
cdf = getattr(distributions, cdf).cdf |
|
|
|
vals = np.sort(np.asarray(rvs)) |
|
|
|
if vals.size <= 1: |
|
raise ValueError('The sample must contain at least two observations.') |
|
|
|
n = len(vals) |
|
cdfvals = cdf(vals, *args) |
|
|
|
u = (2*np.arange(1, n+1) - 1)/(2*n) |
|
w = 1/(12*n) + np.sum((u - cdfvals)**2) |
|
|
|
|
|
p = np.clip(1. - _cdf_cvm(w, n), 0., None) |
|
|
|
return CramerVonMisesResult(statistic=w, pvalue=p) |
|
|
|
|
|
def _get_wilcoxon_distr(n): |
|
""" |
|
    Probability distribution of the Wilcoxon signed-rank statistic r_plus (sum
|
of ranks of positive differences). |
|
Returns an array with the probabilities of all the possible ranks |
|
r = 0, ..., n*(n+1)/2 |
|
""" |
|
c = np.ones(1, dtype=np.float64) |
|
for k in range(1, n + 1): |
|
prev_c = c |
|
c = np.zeros(k * (k + 1) // 2 + 1, dtype=np.float64) |
|
m = len(prev_c) |
|
c[:m] = prev_c * 0.5 |
|
c[-m:] += prev_c * 0.5 |
|
return c |
|
|
|
|
|
def _get_wilcoxon_distr2(n): |
|
""" |
|
    Probability distribution of the Wilcoxon signed-rank statistic r_plus (sum
|
of ranks of positive differences). |
|
Returns an array with the probabilities of all the possible ranks |
|
r = 0, ..., n*(n+1)/2 |
|
This is a slower reference function |
|
References |
|
---------- |
|
.. [1] 1. Harris T, Hardin JW. Exact Wilcoxon Signed-Rank and Wilcoxon |
|
Mann-Whitney Ranksum Tests. The Stata Journal. 2013;13(2):337-343. |
|
""" |
|
ai = np.arange(1, n+1)[:, None] |
|
t = n*(n+1)/2 |
|
q = 2*t |
|
j = np.arange(q) |
|
theta = 2*np.pi/q*j |
|
phi_sp = np.prod(np.cos(theta*ai), axis=0) |
|
phi_s = np.exp(1j*theta*t) * phi_sp |
|
p = np.real(ifft(phi_s)) |
|
res = np.zeros(int(t)+1) |
|
res[:-1:] = p[::2] |
|
res[0] /= 2 |
|
res[-1] = res[0] |
|
return res |
|
|
|
|
|
def _tau_b(A): |
|
"""Calculate Kendall's tau-b and p-value from contingency table.""" |
|
|
|
|
|
|
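    # tau is undefined if the table is degenerate (a single row or column)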
|
if A.shape[0] == 1 or A.shape[1] == 1: |
|
return np.nan, np.nan |
|
|
|
NA = A.sum() |
|
PA = _P(A) |
|
QA = _Q(A) |
|
Sri2 = (A.sum(axis=1)**2).sum() |
|
Scj2 = (A.sum(axis=0)**2).sum() |
|
denominator = (NA**2 - Sri2)*(NA**2 - Scj2) |
|
|
|
tau = (PA-QA)/(denominator)**0.5 |
|
|
|
numerator = 4*(_a_ij_Aij_Dij2(A) - (PA - QA)**2 / NA) |
|
s02_tau_b = numerator/denominator |
|
if s02_tau_b == 0: |
|
return tau, 0 |
|
Z = tau/s02_tau_b**0.5 |
|
p = 2*norm.sf(abs(Z)) |
|
|
|
return tau, p |
|
|
|
|
|
def _somers_d(A, alternative='two-sided'): |
|
"""Calculate Somers' D and p-value from contingency table.""" |
|
|
|
|
|
|
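    # Somers' D is undefined if the table is degenerate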
|
if A.shape[0] <= 1 or A.shape[1] <= 1: |
|
return np.nan, np.nan |
|
|
|
NA = A.sum() |
|
NA2 = NA**2 |
|
PA = _P(A) |
|
QA = _Q(A) |
|
Sri2 = (A.sum(axis=1)**2).sum() |
|
|
|
d = (PA - QA)/(NA2 - Sri2) |
|
|
|
S = _a_ij_Aij_Dij2(A) - (PA-QA)**2/NA |
|
|
|
with np.errstate(divide='ignore'): |
|
Z = (PA - QA)/(4*(S))**0.5 |
|
|
|
norm = _stats_py._SimpleNormal() |
|
p = _stats_py._get_pvalue(Z, norm, alternative, xp=np) |
|
|
|
return d, p |
|
|
|
|
|
@dataclass |
|
class SomersDResult: |
|
statistic: float |
|
pvalue: float |
|
table: np.ndarray |
|
|
|
|
|
def somersd(x, y=None, alternative='two-sided'): |
|
r"""Calculates Somers' D, an asymmetric measure of ordinal association. |
|
|
|
Like Kendall's :math:`\tau`, Somers' :math:`D` is a measure of the |
|
correspondence between two rankings. Both statistics consider the |
|
difference between the number of concordant and discordant pairs in two |
|
rankings :math:`X` and :math:`Y`, and both are normalized such that values |
|
close to 1 indicate strong agreement and values close to -1 indicate |
|
strong disagreement. They differ in how they are normalized. To show the |
|
relationship, Somers' :math:`D` can be defined in terms of Kendall's |
|
:math:`\tau_a`: |
|
|
|
.. math:: |
|
D(Y|X) = \frac{\tau_a(X, Y)}{\tau_a(X, X)} |
|
|
|
Suppose the first ranking :math:`X` has :math:`r` distinct ranks and the |
|
second ranking :math:`Y` has :math:`s` distinct ranks. These two lists of |
|
:math:`n` rankings can also be viewed as an :math:`r \times s` contingency |
|
table in which element :math:`i, j` is the number of rank pairs with rank |
|
:math:`i` in ranking :math:`X` and rank :math:`j` in ranking :math:`Y`. |
|
Accordingly, `somersd` also allows the input data to be supplied as a |
|
single, 2D contingency table instead of as two separate, 1D rankings. |
|
|
|
Note that the definition of Somers' :math:`D` is asymmetric: in general, |
|
:math:`D(Y|X) \neq D(X|Y)`. ``somersd(x, y)`` calculates Somers' |
|
:math:`D(Y|X)`: the "row" variable :math:`X` is treated as an independent |
|
variable, and the "column" variable :math:`Y` is dependent. For Somers' |
|
:math:`D(X|Y)`, swap the input lists or transpose the input table. |
|
|
|
Parameters |
|
---------- |
|
x : array_like |
|
1D array of rankings, treated as the (row) independent variable. |
|
Alternatively, a 2D contingency table. |
|
y : array_like, optional |
|
If `x` is a 1D array of rankings, `y` is a 1D array of rankings of the |
|
same length, treated as the (column) dependent variable. |
|
If `x` is 2D, `y` is ignored. |
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Defines the alternative hypothesis. Default is 'two-sided'. |
|
The following options are available: |
|
* 'two-sided': the rank correlation is nonzero |
|
* 'less': the rank correlation is negative (less than zero) |
|
* 'greater': the rank correlation is positive (greater than zero) |
|
|
|
Returns |
|
------- |
|
res : SomersDResult |
|
A `SomersDResult` object with the following fields: |
|
|
|
statistic : float |
|
The Somers' :math:`D` statistic. |
|
pvalue : float |
|
The p-value for a hypothesis test whose null |
|
hypothesis is an absence of association, :math:`D=0`. |
|
See notes for more information. |
|
table : 2D array |
|
The contingency table formed from rankings `x` and `y` (or the |
|
provided contingency table, if `x` is a 2D array) |
|
|
|
See Also |
|
-------- |
|
kendalltau : Calculates Kendall's tau, another correlation measure. |
|
weightedtau : Computes a weighted version of Kendall's tau. |
|
spearmanr : Calculates a Spearman rank-order correlation coefficient. |
|
pearsonr : Calculates a Pearson correlation coefficient. |
|
|
|
Notes |
|
----- |
|
This function follows the contingency table approach of [2]_ and |
|
[3]_. *p*-values are computed based on an asymptotic approximation of |
|
the test statistic distribution under the null hypothesis :math:`D=0`. |
|
|
|
    Theoretically, hypothesis tests based on Kendall's :math:`\tau` and Somers'
|
:math:`D` should be identical. |
|
However, the *p*-values returned by `kendalltau` are based |
|
on the null hypothesis of *independence* between :math:`X` and :math:`Y` |
|
(i.e. the population from which pairs in :math:`X` and :math:`Y` are |
|
sampled contains equal numbers of all possible pairs), which is more |
|
specific than the null hypothesis :math:`D=0` used here. If the null |
|
hypothesis of independence is desired, it is acceptable to use the |
|
*p*-value returned by `kendalltau` with the statistic returned by |
|
`somersd` and vice versa. For more information, see [2]_. |
|
|
|
Contingency tables are formatted according to the convention used by |
|
SAS and R: the first ranking supplied (``x``) is the "row" variable, and |
|
the second ranking supplied (``y``) is the "column" variable. This is |
|
opposite the convention of Somers' original paper [1]_. |
|
|
|
References |
|
---------- |
|
.. [1] Robert H. Somers, "A New Asymmetric Measure of Association for |
|
Ordinal Variables", *American Sociological Review*, Vol. 27, No. 6, |
|
pp. 799--811, 1962. |
|
|
|
.. [2] Morton B. Brown and Jacqueline K. Benedetti, "Sampling Behavior of |
|
Tests for Correlation in Two-Way Contingency Tables", *Journal of |
|
the American Statistical Association* Vol. 72, No. 358, pp. |
|
309--315, 1977. |
|
|
|
.. [3] SAS Institute, Inc., "The FREQ Procedure (Book Excerpt)", |
|
*SAS/STAT 9.2 User's Guide, Second Edition*, SAS Publishing, 2009. |
|
|
|
.. [4] Laerd Statistics, "Somers' d using SPSS Statistics", *SPSS |
|
Statistics Tutorials and Statistical Guides*, |
|
https://statistics.laerd.com/spss-tutorials/somers-d-using-spss-statistics.php, |
|
Accessed July 31, 2020. |
|
|
|
Examples |
|
-------- |
|
We calculate Somers' D for the example given in [4]_, in which a hotel |
|
chain owner seeks to determine the association between hotel room |
|
cleanliness and customer satisfaction. The independent variable, hotel |
|
room cleanliness, is ranked on an ordinal scale: "below average (1)", |
|
"average (2)", or "above average (3)". The dependent variable, customer |
|
satisfaction, is ranked on a second scale: "very dissatisfied (1)", |
|
"moderately dissatisfied (2)", "neither dissatisfied nor satisfied (3)", |
|
"moderately satisfied (4)", or "very satisfied (5)". 189 customers |
|
respond to the survey, and the results are cast into a contingency table |
|
with the hotel room cleanliness as the "row" variable and customer |
|
satisfaction as the "column" variable. |
|
|
|
+-----+-----+-----+-----+-----+-----+ |
|
| | (1) | (2) | (3) | (4) | (5) | |
|
+=====+=====+=====+=====+=====+=====+ |
|
| (1) | 27 | 25 | 14 | 7 | 0 | |
|
+-----+-----+-----+-----+-----+-----+ |
|
| (2) | 7 | 14 | 18 | 35 | 12 | |
|
+-----+-----+-----+-----+-----+-----+ |
|
| (3) | 1 | 3 | 2 | 7 | 17 | |
|
+-----+-----+-----+-----+-----+-----+ |
|
|
|
For example, 27 customers assigned their room a cleanliness ranking of |
|
"below average (1)" and a corresponding satisfaction of "very |
|
dissatisfied (1)". We perform the analysis as follows. |
|
|
|
>>> from scipy.stats import somersd |
|
>>> table = [[27, 25, 14, 7, 0], [7, 14, 18, 35, 12], [1, 3, 2, 7, 17]] |
|
>>> res = somersd(table) |
|
>>> res.statistic |
|
0.6032766111513396 |
|
>>> res.pvalue |
|
1.0007091191074533e-27 |
|
|
|
The value of the Somers' D statistic is approximately 0.6, indicating |
|
a positive correlation between room cleanliness and customer satisfaction |
|
in the sample. |
|
The *p*-value is very small, indicating a very small probability of |
|
observing such an extreme value of the statistic under the null |
|
hypothesis that the statistic of the entire population (from which |
|
our sample of 189 customers is drawn) is zero. This supports the |
|
alternative hypothesis that the true value of Somers' D for the population |
|
is nonzero. |
|
|
|
""" |
|
x, y = np.array(x), np.array(y) |
|
if x.ndim == 1: |
|
if x.size != y.size: |
|
raise ValueError("Rankings must be of equal length.") |
|
table = scipy.stats.contingency.crosstab(x, y)[1] |
|
elif x.ndim == 2: |
|
if np.any(x < 0): |
|
raise ValueError("All elements of the contingency table must be " |
|
"non-negative.") |
|
if np.any(x != x.astype(int)): |
|
raise ValueError("All elements of the contingency table must be " |
|
"integer.") |
|
if x.nonzero()[0].size < 2: |
|
raise ValueError("At least two elements of the contingency table " |
|
"must be nonzero.") |
|
table = x |
|
else: |
|
raise ValueError("x must be either a 1D or 2D array") |
|
|
|
d, p = _somers_d(table.astype(float), alternative) |
|
|
|
|
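    # add alias for consistency with other correlation functions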
|
res = SomersDResult(d, p, table) |
|
res.correlation = d |
|
return res |
|
|
|
|
|
|
|
def _all_partitions(nx, ny): |
|
""" |
|
Partition a set of indices into two fixed-length sets in all possible ways |
|
|
|
Partition a set of indices 0 ... nx + ny - 1 into two sets of length nx and |
|
ny in all possible ways (ignoring order of elements). |
|
""" |
|
z = np.arange(nx+ny) |
|
for c in combinations(z, nx): |
|
x = np.array(c) |
|
mask = np.ones(nx+ny, bool) |
|
mask[x] = False |
|
y = z[mask] |
|
yield x, y |
|
|
|
|
|
def _compute_log_combinations(n): |
|
"""Compute all log combination of C(n, k).""" |
|
gammaln_arr = gammaln(np.arange(n + 1) + 1) |
|
return gammaln(n + 1) - gammaln_arr - gammaln_arr[::-1] |
|
|
|
|
|
@dataclass |
|
class BarnardExactResult: |
|
statistic: float |
|
pvalue: float |
|
|
|
|
|
def barnard_exact(table, alternative="two-sided", pooled=True, n=32): |
|
r"""Perform a Barnard exact test on a 2x2 contingency table. |
|
|
|
Parameters |
|
---------- |
|
table : array_like of ints |
|
A 2x2 contingency table. Elements should be non-negative integers. |
|
|
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Defines the null and alternative hypotheses. Default is 'two-sided'. |
|
Please see explanations in the Notes section below. |
|
|
|
pooled : bool, optional |
|
Whether to compute score statistic with pooled variance (as in |
|
Student's t-test, for example) or unpooled variance (as in Welch's |
|
t-test). Default is ``True``. |
|
|
|
n : int, optional |
|
Number of sampling points used in the construction of the sampling |
|
method. Note that this argument will automatically be converted to |
|
the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to |
|
select sample points. Default is 32. Must be positive. In most cases, |
|
        32 points are enough to reach good precision. Using more points
        comes at a performance cost.
|
|
|
Returns |
|
------- |
|
ber : BarnardExactResult |
|
A result object with the following attributes. |
|
|
|
statistic : float |
|
The Wald statistic with pooled or unpooled variance, depending |
|
on the user choice of `pooled`. |
|
|
|
pvalue : float |
|
P-value, the probability of obtaining a distribution at least as |
|
extreme as the one that was actually observed, assuming that the |
|
null hypothesis is true. |
|
|
|
See Also |
|
-------- |
|
chi2_contingency : Chi-square test of independence of variables in a |
|
contingency table. |
|
fisher_exact : Fisher exact test on a 2x2 contingency table. |
|
boschloo_exact : Boschloo's exact test on a 2x2 contingency table, |
|
        which is a uniformly more powerful alternative to Fisher's exact test.
|
|
|
Notes |
|
----- |
|
Barnard's test is an exact test used in the analysis of contingency |
|
tables. It examines the association of two categorical variables, and |
|
    is a more powerful alternative to Fisher's exact test
|
for 2x2 contingency tables. |
|
|
|
Let's define :math:`X_0` a 2x2 matrix representing the observed sample, |
|
where each column stores the binomial experiment, as in the example |
|
below. Let's also define :math:`p_1, p_2` the theoretical binomial |
|
probabilities for :math:`x_{11}` and :math:`x_{12}`. When using |
|
    Barnard's exact test, we can assert three different null hypotheses:
|
|
|
- :math:`H_0 : p_1 \geq p_2` versus :math:`H_1 : p_1 < p_2`, |
|
with `alternative` = "less" |
|
|
|
- :math:`H_0 : p_1 \leq p_2` versus :math:`H_1 : p_1 > p_2`, |
|
with `alternative` = "greater" |
|
|
|
- :math:`H_0 : p_1 = p_2` versus :math:`H_1 : p_1 \neq p_2`, |
|
with `alternative` = "two-sided" (default one) |
|
|
|
In order to compute Barnard's exact test, we are using the Wald |
|
statistic [3]_ with pooled or unpooled variance. |
|
Under the default assumption that both variances are equal |
|
(``pooled = True``), the statistic is computed as: |
|
|
|
.. math:: |
|
|
|
T(X) = \frac{ |
|
\hat{p}_1 - \hat{p}_2 |
|
}{ |
|
\sqrt{ |
|
\hat{p}(1 - \hat{p}) |
|
(\frac{1}{c_1} + |
|
\frac{1}{c_2}) |
|
} |
|
} |
|
|
|
with :math:`\hat{p}_1, \hat{p}_2` and :math:`\hat{p}` the estimator of |
|
:math:`p_1, p_2` and :math:`p`, the latter being the combined probability, |
|
given the assumption that :math:`p_1 = p_2`. |
|
|
|
If this assumption is invalid (``pooled = False``), the statistic is: |
|
|
|
.. math:: |
|
|
|
T(X) = \frac{ |
|
\hat{p}_1 - \hat{p}_2 |
|
}{ |
|
\sqrt{ |
|
\frac{\hat{p}_1 (1 - \hat{p}_1)}{c_1} + |
|
\frac{\hat{p}_2 (1 - \hat{p}_2)}{c_2} |
|
} |
|
} |
|
|
|
The p-value is then computed as: |
|
|
|
.. math:: |
|
|
|
\sum |
|
\binom{c_1}{x_{11}} |
|
\binom{c_2}{x_{12}} |
|
\pi^{x_{11} + x_{12}} |
|
(1 - \pi)^{t - x_{11} - x_{12}} |
|
|
|
where the sum is over all 2x2 contingency tables :math:`X` such that: |
|
* :math:`T(X) \leq T(X_0)` when `alternative` = "less", |
|
* :math:`T(X) \geq T(X_0)` when `alternative` = "greater", or |
|
    * :math:`|T(X)| \geq |T(X_0)|` when `alternative` = "two-sided".

    Above, :math:`c_1, c_2` are the sums of columns 1 and 2,
    and :math:`t` the total (the sum of all four table entries).
|
|
|
The returned p-value is the maximum p-value taken over the nuisance |
|
parameter :math:`\pi`, where :math:`0 \leq \pi \leq 1`. |
|
|
|
This function's complexity is :math:`O(n c_1 c_2)`, where `n` is the |
|
number of sample points. |
|
|
|
References |
|
---------- |
|
.. [1] Barnard, G. A. "Significance Tests for 2x2 Tables". *Biometrika*. |
|
34.1/2 (1947): 123-138. :doi:`dpgkg3` |
|
|
|
.. [2] Mehta, Cyrus R., and Pralay Senchaudhuri. "Conditional versus |
|
unconditional exact tests for comparing two binomials." |
|
*Cytel Software Corporation* 675 (2003): 1-5. |
|
|
|
.. [3] "Wald Test". *Wikipedia*. https://en.wikipedia.org/wiki/Wald_test |
|
|
|
Examples |
|
-------- |
|
An example use of Barnard's test is presented in [2]_. |
|
|
|
Consider the following example of a vaccine efficacy study |
|
(Chan, 1998). In a randomized clinical trial of 30 subjects, 15 were |
|
    inoculated with a recombinant DNA influenza vaccine and the other 15 were
|
inoculated with a placebo. Twelve of the 15 subjects in the placebo |
|
group (80%) eventually became infected with influenza whereas for the |
|
vaccine group, only 7 of the 15 subjects (47%) became infected. The |
|
data are tabulated as a 2 x 2 table:: |
|
|
|
Vaccine Placebo |
|
Yes 7 12 |
|
No 8 3 |
|
|
|
When working with statistical hypothesis testing, we usually use a |
|
threshold probability or significance level upon which we decide |
|
to reject the null hypothesis :math:`H_0`. Suppose we choose the common |
|
significance level of 5%. |
|
|
|
Our alternative hypothesis is that the vaccine will lower the chance of |
|
becoming infected with the virus; that is, the probability :math:`p_1` of |
|
catching the virus with the vaccine will be *less than* the probability |
|
:math:`p_2` of catching the virus without the vaccine. Therefore, we call |
|
`barnard_exact` with the ``alternative="less"`` option: |
|
|
|
>>> import scipy.stats as stats |
|
>>> res = stats.barnard_exact([[7, 12], [8, 3]], alternative="less") |
|
>>> res.statistic |
|
-1.894 |
|
>>> res.pvalue |
|
0.03407 |
|
|
|
Under the null hypothesis that the vaccine will not lower the chance of |
|
becoming infected, the probability of obtaining test results at least as |
|
extreme as the observed data is approximately 3.4%. Since this p-value is |
|
less than our chosen significance level, we have evidence to reject |
|
:math:`H_0` in favor of the alternative. |
|
|
|
Suppose we had used Fisher's exact test instead: |
|
|
|
>>> _, pvalue = stats.fisher_exact([[7, 12], [8, 3]], alternative="less") |
|
>>> pvalue |
|
0.0640 |
|
|
|
With the same threshold significance of 5%, we would not have been able |
|
to reject the null hypothesis in favor of the alternative. As stated in |
|
[2]_, Barnard's test is uniformly more powerful than Fisher's exact test |
|
because Barnard's test does not condition on any margin. Fisher's test |
|
should only be used when both sets of marginals are fixed. |
|
|
|
""" |
|
if n <= 0: |
|
raise ValueError( |
|
"Number of points `n` must be strictly positive, " |
|
f"found {n!r}" |
|
) |
|
|
|
table = np.asarray(table, dtype=np.int64) |
|
|
|
if not table.shape == (2, 2): |
|
raise ValueError("The input `table` must be of shape (2, 2).") |
|
|
|
if np.any(table < 0): |
|
raise ValueError("All values in `table` must be nonnegative.") |
|
|
|
if 0 in table.sum(axis=0): |
|
|
|
|
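        # both values of a column are zero: the statistic is undefined
        # and the p-value is 1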
|
return BarnardExactResult(np.nan, 1.0) |
|
|
|
total_col_1, total_col_2 = table.sum(axis=0) |
|
|
|
x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(-1, 1) |
|
x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(1, -1) |
|
|
|
|
|
|
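    # estimated proportions for every possible pair (x1, x2)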
|
p1, p2 = x1 / total_col_1, x2 / total_col_2 |
|
|
|
if pooled: |
|
p = (x1 + x2) / (total_col_1 + total_col_2) |
|
variances = p * (1 - p) * (1 / total_col_1 + 1 / total_col_2) |
|
else: |
|
variances = p1 * (1 - p1) / total_col_1 + p2 * (1 - p2) / total_col_2 |
|
|
|
|
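    # suppress warnings when dividing by 0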
|
with np.errstate(divide="ignore", invalid="ignore"): |
|
wald_statistic = np.divide((p1 - p2), np.sqrt(variances)) |
|
|
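    # the statistic is defined as 0 when p1 == p2 (the division yields
    # nan there when the variance is 0)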
|
wald_statistic[p1 == p2] = 0 |
|
|
|
wald_stat_obs = wald_statistic[table[0, 0], table[0, 1]] |
|
|
|
if alternative == "two-sided": |
|
index_arr = np.abs(wald_statistic) >= abs(wald_stat_obs) |
|
elif alternative == "less": |
|
index_arr = wald_statistic <= wald_stat_obs |
|
elif alternative == "greater": |
|
index_arr = wald_statistic >= wald_stat_obs |
|
else: |
|
msg = ( |
|
"`alternative` should be one of {'two-sided', 'less', 'greater'}," |
|
f" found {alternative!r}" |
|
) |
|
raise ValueError(msg) |
|
|
|
x1_sum_x2 = x1 + x2 |
|
|
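    # precompute the log binomial coefficients used in the p-value formula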
|
x1_log_comb = _compute_log_combinations(total_col_1) |
|
x2_log_comb = _compute_log_combinations(total_col_2) |
|
x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2] |
|
|
|
result = shgo( |
|
_get_binomial_log_p_value_with_nuisance_param, |
|
args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr), |
|
bounds=((0, 1),), |
|
n=n, |
|
sampling_method="sobol", |
|
) |
|
|
|
|
|
|
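    # result.fun is the minimum of the negative log p-value over the
    # nuisance parameter, so the maximum p-value is exp(-result.fun)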
|
p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1) |
|
return BarnardExactResult(wald_stat_obs, p_value) |
|
|
|
|
|
@dataclass |
|
class BoschlooExactResult: |
|
statistic: float |
|
pvalue: float |
|
|
|
|
|
def boschloo_exact(table, alternative="two-sided", n=32): |
|
r"""Perform Boschloo's exact test on a 2x2 contingency table. |
|
|
|
Parameters |
|
---------- |
|
table : array_like of ints |
|
A 2x2 contingency table. Elements should be non-negative integers. |
|
|
|
alternative : {'two-sided', 'less', 'greater'}, optional |
|
Defines the null and alternative hypotheses. Default is 'two-sided'. |
|
Please see explanations in the Notes section below. |
|
|
|
n : int, optional |
|
Number of sampling points used in the construction of the sampling |
|
method. Note that this argument will automatically be converted to |
|
the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to |
|
select sample points. Default is 32. Must be positive. In most cases, |
|
        32 points are enough to reach good precision. Using more points
        comes at a performance cost.
|
|
|
Returns |
|
------- |
|
ber : BoschlooExactResult |
|
A result object with the following attributes. |
|
|
|
statistic : float |
|
The statistic used in Boschloo's test; that is, the p-value |
|
from Fisher's exact test. |
|
|
|
pvalue : float |
|
P-value, the probability of obtaining a distribution at least as |
|
extreme as the one that was actually observed, assuming that the |
|
null hypothesis is true. |
|
|
|
See Also |
|
-------- |
|
chi2_contingency : Chi-square test of independence of variables in a |
|
contingency table. |
|
fisher_exact : Fisher exact test on a 2x2 contingency table. |
|
barnard_exact : Barnard's exact test, which is a more powerful alternative |
|
        to Fisher's exact test for 2x2 contingency tables.
|
|
|
Notes |
|
----- |
|
Boschloo's test is an exact test used in the analysis of contingency |
|
tables. It examines the association of two categorical variables, and |
|
is a uniformly more powerful alternative to Fisher's exact test |
|
for 2x2 contingency tables. |
|
|
|
Boschloo's exact test uses the p-value of Fisher's exact test as a |
|
statistic, and Boschloo's p-value is the probability under the null |
|
hypothesis of observing such an extreme value of this statistic. |
|
|
|
Let's define :math:`X_0` a 2x2 matrix representing the observed sample, |
|
where each column stores the binomial experiment, as in the example |
|
below. Let's also define :math:`p_1, p_2` the theoretical binomial |
|
probabilities for :math:`x_{11}` and :math:`x_{12}`. When using |
|
    Boschloo's exact test, we can assert three different alternative hypotheses:
|
|
|
- :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 < p_2`, |
|
with `alternative` = "less" |
|
|
|
- :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 > p_2`, |
|
with `alternative` = "greater" |
|
|
|
- :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 \neq p_2`, |
|
with `alternative` = "two-sided" (default) |
|
|
|
There are multiple conventions for computing a two-sided p-value when the |
|
null distribution is asymmetric. Here, we apply the convention that the |
|
p-value of a two-sided test is twice the minimum of the p-values of the |
|
one-sided tests (clipped to 1.0). Note that `fisher_exact` follows a |
|
different convention, so for a given `table`, the statistic reported by |
|
`boschloo_exact` may differ from the p-value reported by `fisher_exact` |
|
when ``alternative='two-sided'``. |
|
|
|
.. versionadded:: 1.7.0 |
|
|
|
References |
|
---------- |
|
.. [1] R.D. Boschloo. "Raised conditional level of significance for the |
|
2 x 2-table when testing the equality of two probabilities", |
|
Statistica Neerlandica, 24(1), 1970 |
|
|
|
.. [2] "Boschloo's test", Wikipedia, |
|
https://en.wikipedia.org/wiki/Boschloo%27s_test |
|
|
|
.. [3] Lise M. Saari et al. "Employee attitudes and job satisfaction", |
|
Human Resource Management, 43(4), 395-407, 2004, |
|
:doi:`10.1002/hrm.20032`. |
|
|
|
Examples |
|
-------- |
|
In the following example, we consider the article "Employee |
|
attitudes and job satisfaction" [3]_ |
|
which reports the results of a survey from 63 scientists and 117 college |
|
professors. Of the 63 scientists, 31 said they were very satisfied with |
|
their jobs, whereas 74 of the college professors were very satisfied |
|
with their work. Is this significant evidence that college |
|
professors are happier with their work than scientists? |
|
The following table summarizes the data mentioned above:: |
|
|
|
college professors scientists |
|
Very Satisfied 74 31 |
|
Dissatisfied 43 32 |
|
|
|
When working with statistical hypothesis testing, we usually use a |
|
threshold probability or significance level upon which we decide |
|
to reject the null hypothesis :math:`H_0`. Suppose we choose the common |
|
significance level of 5%. |
|
|
|
Our alternative hypothesis is that college professors are truly more |
|
satisfied with their work than scientists. Therefore, we expect |
|
:math:`p_1` the proportion of very satisfied college professors to be |
|
greater than :math:`p_2`, the proportion of very satisfied scientists. |
|
We thus call `boschloo_exact` with the ``alternative="greater"`` option: |
|
|
|
>>> import scipy.stats as stats |
|
>>> res = stats.boschloo_exact([[74, 31], [43, 32]], alternative="greater") |
|
>>> res.statistic |
|
0.0483 |
|
>>> res.pvalue |
|
0.0355 |
|
|
|
Under the null hypothesis that scientists are happier in their work than |
|
college professors, the probability of obtaining test |
|
results at least as extreme as the observed data is approximately 3.55%. |
|
Since this p-value is less than our chosen significance level, we have |
|
evidence to reject :math:`H_0` in favor of the alternative hypothesis. |
|
|
|
""" |
|
hypergeom = distributions.hypergeom |
|
|
|
if n <= 0: |
|
raise ValueError( |
|
"Number of points `n` must be strictly positive," |
|
f" found {n!r}" |
|
) |
|
|
|
table = np.asarray(table, dtype=np.int64) |
|
|
|
if not table.shape == (2, 2): |
|
raise ValueError("The input `table` must be of shape (2, 2).") |
|
|
|
if np.any(table < 0): |
|
raise ValueError("All values in `table` must be nonnegative.") |
|
|
|
if 0 in table.sum(axis=0): |
|
|
|
|
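        # both values of a column are zero: the statistic and the p-value
        # are undefined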
|
return BoschlooExactResult(np.nan, np.nan) |
|
|
|
total_col_1, total_col_2 = table.sum(axis=0) |
|
total = total_col_1 + total_col_2 |
|
x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(1, -1) |
|
x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(-1, 1) |
|
x1_sum_x2 = x1 + x2 |
|
|
|
if alternative == 'less': |
|
pvalues = hypergeom.cdf(x1, total, x1_sum_x2, total_col_1).T |
|
elif alternative == 'greater': |
|
|
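        # same formula as the 'less' case with the columns exchanged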
|
pvalues = hypergeom.cdf(x2, total, x1_sum_x2, total_col_2).T |
|
elif alternative == 'two-sided': |
|
boschloo_less = boschloo_exact(table, alternative="less", n=n) |
|
boschloo_greater = boschloo_exact(table, alternative="greater", n=n) |
|
|
|
res = ( |
|
boschloo_less if boschloo_less.pvalue < boschloo_greater.pvalue |
|
else boschloo_greater |
|
) |
|
|
|
|
|
|
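        # two-sided p-value: twice the smaller one-sided p-value,
        # clipped to [0, 1]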
|
pvalue = np.clip(2 * res.pvalue, a_min=0, a_max=1) |
|
return BoschlooExactResult(res.statistic, pvalue) |
|
else: |
|
msg = ( |
|
f"`alternative` should be one of {'two-sided', 'less', 'greater'}," |
|
f" found {alternative!r}" |
|
) |
|
raise ValueError(msg) |
|
|
|
fisher_stat = pvalues[table[0, 0], table[0, 1]] |
|
|
|
|
|
|
|
|
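    # the factor (1 + 1e-13) guards against missing tables whose p-value
    # equals the observed one up to floating point round-off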
|
index_arr = pvalues <= fisher_stat * (1+1e-13) |
|
|
|
x1, x2, x1_sum_x2 = x1.T, x2.T, x1_sum_x2.T |
|
x1_log_comb = _compute_log_combinations(total_col_1) |
|
x2_log_comb = _compute_log_combinations(total_col_2) |
|
x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2] |
|
|
|
result = shgo( |
|
_get_binomial_log_p_value_with_nuisance_param, |
|
args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr), |
|
bounds=((0, 1),), |
|
n=n, |
|
sampling_method="sobol", |
|
) |
|
|
|
|
|
|
|
p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1) |
|
return BoschlooExactResult(fisher_stat, p_value) |
|
|
|
|
|
def _get_binomial_log_p_value_with_nuisance_param( |
|
nuisance_param, x1_sum_x2, x1_sum_x2_log_comb, index_arr |
|
): |
|
r""" |
|
    Compute the log p-value with respect to a nuisance parameter over a
    2x2 sample space.
|
|
|
Parameters |
|
---------- |
|
nuisance_param : float |
|
        Nuisance parameter used in the maximization of the p-value. Must
        be between 0 and 1.
|
|
|
x1_sum_x2 : ndarray |
|
        Sum of x1 and x2 as computed in `barnard_exact` / `boschloo_exact`.
|
|
|
x1_sum_x2_log_comb : ndarray |
|
        Sum of the log binomial coefficients of x1 and x2.
|
|
|
    index_arr : ndarray of boolean
        Mask selecting the tables at least as extreme as the observed one.
|
|
|
Returns |
|
------- |
|
    neg_log_pvalue : float
        The negative log p-value for the given nuisance parameter. `shgo`
        minimizes this quantity to find the maximum p-value.
|
|
|
Notes |
|
----- |
|
|
|
Both Barnard's test and Boschloo's test iterate over a nuisance parameter |
|
    :math:`\pi \in [0, 1]` to find the maximum p-value. To search for this
    maximum, this function returns the negative log p-value with respect to
    the nuisance parameter passed in `nuisance_param`. This negative log
    p-value is then minimized by `shgo`; the minimum of the negative log
    p-value corresponds to the maximum p-value.
|
|
|
    Also, to compute the binomial coefficients that appear in the p-value
    formula, this function uses `gammaln`, which tolerates large values
    better than `scipy.special.comb`. `gammaln` yields log binomial
    coefficients; for a small loss of precision, performance improves
    considerably.
|
""" |
|
t1, t2 = x1_sum_x2.shape |
|
n = t1 + t2 - 2 |
|
with np.errstate(divide="ignore", invalid="ignore"): |
|
log_nuisance = np.log( |
|
nuisance_param, |
|
out=np.zeros_like(nuisance_param), |
|
where=nuisance_param >= 0, |
|
) |
|
log_1_minus_nuisance = np.log( |
|
1 - nuisance_param, |
|
out=np.zeros_like(nuisance_param), |
|
where=1 - nuisance_param >= 0, |
|
) |
|
|
|
nuisance_power_x1_x2 = log_nuisance * x1_sum_x2 |
|
nuisance_power_x1_x2[(x1_sum_x2 == 0)[:, :]] = 0 |
|
|
|
nuisance_power_n_minus_x1_x2 = log_1_minus_nuisance * (n - x1_sum_x2) |
|
nuisance_power_n_minus_x1_x2[(x1_sum_x2 == n)[:, :]] = 0 |
|
|
|
tmp_log_values_arr = ( |
|
x1_sum_x2_log_comb |
|
+ nuisance_power_x1_x2 |
|
+ nuisance_power_n_minus_x1_x2 |
|
) |
|
|
|
tmp_values_from_index = tmp_log_values_arr[index_arr] |
|
|
|
|
|
|
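    # the probabilities are summed in log space with the log-sum-exp trick
    # to avoid underflow: log(sum(exp(v))) = m + log(sum(exp(v - m)))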
|
max_value = tmp_values_from_index.max() |
|
|
|
|
|
|
|
|
|
|
|
with np.errstate(divide="ignore", invalid="ignore"): |
|
log_probs = np.exp(tmp_values_from_index - max_value).sum() |
|
log_pvalue = max_value + np.log( |
|
log_probs, |
|
out=np.full_like(log_probs, -np.inf), |
|
where=log_probs > 0, |
|
) |
|
|
|
|
|
return -log_pvalue |
|
|
|
|
|
def _pval_cvm_2samp_exact(s, m, n): |
|
""" |
|
Compute the exact p-value of the Cramer-von Mises two-sample test |
|
for a given value s of the test statistic. |
|
m and n are the sizes of the samples. |
|
|
|
[1] Y. Xiao, A. Gordon, and A. Yakovlev, "A C++ Program for |
|
the Cramér-Von Mises Two-Sample Test", J. Stat. Soft., |
|
vol. 17, no. 8, pp. 1-15, Dec. 2006. |
|
[2] T. W. Anderson "On the Distribution of the Two-Sample Cramer-von Mises |
|
Criterion," The Annals of Mathematical Statistics, Ann. Math. Statist. |
|
33(3), 1148-1159, (September, 1962) |
|
""" |
|
|
|
|
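    # the statistic is rescaled by the least common multiple of m and n so
    # that the computation can proceed in integer arithmetic (see [1])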
|
lcm = np.lcm(m, n) |
|
|
|
a = lcm // m |
|
b = lcm // n |
|
|
|
|
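    # zeta is an integer-valued affine transform of the statistic s; the
    # exact p-value is the fraction of orderings with a value >= zeta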
|
mn = m * n |
|
zeta = lcm ** 2 * (m + n) * (6 * s - mn * (4 * mn - 1)) // (6 * mn ** 2) |
|
|
|
|
|
zeta_bound = lcm**2 * (m + n) |
|
combinations = comb(m + n, m) |
|
max_gs = max(zeta_bound, combinations) |
|
dtype = np.min_scalar_type(max_gs) |
|
|
|
|
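    # dynamic program over all C(m + n, m) orderings of the pooled sample:
    # gs[v] maps each attainable value of the integer statistic to its
    # frequency after v observations of x have been placed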
|
gs = ([np.array([[0], [1]], dtype=dtype)] |
|
+ [np.empty((2, 0), dtype=dtype) for _ in range(m)]) |
|
for u in range(n + 1): |
|
next_gs = [] |
|
tmp = np.empty((2, 0), dtype=dtype) |
|
for v, g in enumerate(gs): |
|
|
|
|
|
vi, i0, i1 = np.intersect1d(tmp[0], g[0], return_indices=True) |
|
tmp = np.concatenate([ |
|
np.stack([vi, tmp[1, i0] + g[1, i1]]), |
|
np.delete(tmp, i0, 1), |
|
np.delete(g, i1, 1) |
|
], 1) |
|
res = (a * v - b * u) ** 2 |
|
tmp[0] += res.astype(dtype) |
|
next_gs.append(tmp) |
|
gs = next_gs |
|
value, freq = gs[m] |
|
return np.float64(np.sum(freq[value >= zeta]) / combinations) |
|
|
|
|
|
@_axis_nan_policy_factory(CramerVonMisesResult, n_samples=2, too_small=1, |
|
result_to_tuple=_cvm_result_to_tuple) |
|
def cramervonmises_2samp(x, y, method='auto'): |
|
"""Perform the two-sample Cramér-von Mises test for goodness of fit. |
|
|
|
This is the two-sample version of the Cramér-von Mises test ([1]_): |
|
for two independent samples :math:`X_1, ..., X_n` and |
|
:math:`Y_1, ..., Y_m`, the null hypothesis is that the samples |
|
come from the same (unspecified) continuous distribution. |
|
|
|
Parameters |
|
---------- |
|
x : array_like |
|
A 1-D array of observed values of the random variables :math:`X_i`. |
|
Must contain at least two observations. |
|
y : array_like |
|
A 1-D array of observed values of the random variables :math:`Y_i`. |
|
Must contain at least two observations. |
|
method : {'auto', 'asymptotic', 'exact'}, optional |
|
The method used to compute the p-value, see Notes for details. |
|
The default is 'auto'. |
|
|
|
Returns |
|
------- |
|
res : object with attributes |
|
statistic : float |
|
Cramér-von Mises statistic. |
|
pvalue : float |
|
The p-value. |
|
|
|
See Also |
|
-------- |
|
cramervonmises, anderson_ksamp, epps_singleton_2samp, ks_2samp |
|
|
|
Notes |
|
----- |
|
.. versionadded:: 1.7.0 |
|
|
|
The statistic is computed according to equation 9 in [2]_. The |
|
calculation of the p-value depends on the keyword `method`: |
|
|
|
- ``asymptotic``: The p-value is approximated by using the limiting |
|
distribution of the test statistic. |
|
- ``exact``: The exact p-value is computed by enumerating all |
|
possible combinations of the test statistic, see [2]_. |
|
|
|
If ``method='auto'``, the exact approach is used |
|
    if both samples contain no more than 20 observations;
    otherwise, the asymptotic distribution is used.
|
|
|
If the underlying distribution is not continuous, the p-value is likely to |
|
be conservative (Section 6.2 in [3]_). When ranking the data to compute |
|
the test statistic, midranks are used if there are ties. |
|
|
|
References |
|
---------- |
|
.. [1] https://en.wikipedia.org/wiki/Cramer-von_Mises_criterion |
|
.. [2] Anderson, T.W. (1962). On the distribution of the two-sample |
|
Cramer-von-Mises criterion. The Annals of Mathematical |
|
Statistics, pp. 1148-1159. |
|
.. [3] Conover, W.J., Practical Nonparametric Statistics, 1971. |
|
|
|
Examples |
|
-------- |
|
|
|
Suppose we wish to test whether two samples generated by |
|
``scipy.stats.norm.rvs`` have the same distribution. We choose a |
|
significance level of alpha=0.05. |
|
|
|
>>> import numpy as np |
|
>>> from scipy import stats |
|
>>> rng = np.random.default_rng() |
|
>>> x = stats.norm.rvs(size=100, random_state=rng) |
|
>>> y = stats.norm.rvs(size=70, random_state=rng) |
|
>>> res = stats.cramervonmises_2samp(x, y) |
|
>>> res.statistic, res.pvalue |
|
(0.29376470588235293, 0.1412873014573014) |
|
|
|
The p-value exceeds our chosen significance level, so we do not |
|
reject the null hypothesis that the observed samples are drawn from the |
|
same distribution. |
|
|
|
For small sample sizes, one can compute the exact p-values: |
|
|
|
>>> x = stats.norm.rvs(size=7, random_state=rng) |
|
>>> y = stats.t.rvs(df=2, size=6, random_state=rng) |
|
>>> res = stats.cramervonmises_2samp(x, y, method='exact') |
|
>>> res.statistic, res.pvalue |
|
(0.197802197802198, 0.31643356643356646) |
|
|
|
The p-value based on the asymptotic distribution is a good approximation |
|
even though the sample size is small. |
|
|
|
>>> res = stats.cramervonmises_2samp(x, y, method='asymptotic') |
|
>>> res.statistic, res.pvalue |
|
(0.197802197802198, 0.2966041181527128) |
|
|
|
Independent of the method, one would not reject the null hypothesis at the |
|
chosen significance level in this example. |
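


    Because midranks are used when there are ties, the test can also be

    applied to samples containing repeated values, although the p-value

    may then be conservative (see Notes). A minimal sketch with small,

    hypothetical discrete samples:



    >>> x = np.array([1, 2, 2, 3, 3, 3, 4])

    >>> y = np.array([2, 2, 3, 4, 4, 5])

    >>> res = stats.cramervonmises_2samp(x, y)

    >>> res.statistic, res.pvalue  # doctest: +SKIP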
|
|
|
""" |
|
xa = np.sort(np.asarray(x)) |
|
ya = np.sort(np.asarray(y)) |
|
|
|
if xa.size <= 1 or ya.size <= 1: |
|
raise ValueError('x and y must contain at least two observations.') |
|
if method not in ['auto', 'exact', 'asymptotic']: |
|
raise ValueError('method must be either auto, exact or asymptotic.') |
|
|
|
nx = len(xa) |
|
ny = len(ya) |
|
|
|
if method == 'auto': |
|
if max(nx, ny) > 20: |
|
method = 'asymptotic' |
|
else: |
|
method = 'exact' |
|
|
|
|
|
    # get the ranks of x and y in the pooled sample

    z = np.concatenate([xa, ya])

    # in case of ties, use midranks (see Notes)

    r = scipy.stats.rankdata(z, method='average')

    rx = r[:nx]

    ry = r[nx:]
|
|
|
|
|
    # compute the rank-based statistic U of [2]

    u = nx * np.sum((rx - np.arange(1, nx+1))**2)

    u += ny * np.sum((ry - np.arange(1, ny+1))**2)
|
|
|
|
|
    k, N = nx*ny, nx + ny

    # compute the test statistic T according to equation 9 in [2]

    t = u / (k*N) - (4*k - 1)/(6*N)
|
|
|
if method == 'exact': |
|
p = _pval_cvm_2samp_exact(u, nx, ny) |
|
else: |
|
|
|
        # expected value and variance of T under the null hypothesis [2]

        et = (1 + 1/N)/6

        vt = (N+1) * (4*k*N - 3*(nx**2 + ny**2) - 2*k)

        vt = vt / (45 * N**2 * 4 * k)



        # normalize the statistic so that it follows the limiting

        # distribution of the one-sample statistic

        tn = 1/6 + (t - et) / np.sqrt(45 * vt)
|
|
|
|
|
|
|
|
|
        # for tn below 0.003, the limiting CDF is numerically zero,

        # so return a p-value of 1.0 directly

        if tn < 0.003:

            p = 1.0

        else:

            p = max(0, 1. - _cdf_cvm_inf(tn))
|
|
|
return CramerVonMisesResult(statistic=t, pvalue=p) |
|
|
|
|
|
class TukeyHSDResult: |
|
"""Result of `scipy.stats.tukey_hsd`. |
|
|
|
Attributes |
|
---------- |
|
statistic : float ndarray |
|
The computed statistic of the test for each comparison. The element |
|
at index ``(i, j)`` is the statistic for the comparison between groups |
|
``i`` and ``j``. |
|
pvalue : float ndarray |
|
The associated p-value from the studentized range distribution. The |
|
element at index ``(i, j)`` is the p-value for the comparison |
|
between groups ``i`` and ``j``. |
|
|
|
Notes |
|
----- |
|
The string representation of this object displays the most recently |
|
calculated confidence interval, and if none have been previously |
|
calculated, it will evaluate ``confidence_interval()``. |
|
|
|
References |
|
---------- |
|
.. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's |
|
Method." |
|
https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm, |
|
28 November 2020. |
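


    Examples

    --------

    Printing the result displays the pairwise comparison table; this

    triggers a call to ``confidence_interval()`` if none has been made

    yet. A small sketch with two hypothetical groups:



    >>> from scipy.stats import tukey_hsd

    >>> res = tukey_hsd([24.5, 23.5, 26.4], [28.4, 34.2, 29.5])

    >>> print(res)  # doctest: +SKIP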
|
""" |
|
|
|
    def __init__(self, statistic, pvalue, _ntreatments, _nobs, _stand_err):
|
self.statistic = statistic |
|
self.pvalue = pvalue |
|
self._ntreatments = _ntreatments |
|
self._nobs = _nobs |
|
self._stand_err = _stand_err |
|
self._ci = None |
|
self._ci_cl = None |
|
|
|
    def __str__(self):

        # Note: `__str__` prints the confidence intervals from the most

        # recent call to `confidence_interval`; if it has not yet been

        # called, it is evaluated here with the default 95% level.

        if self._ci is None:

            self.confidence_interval(confidence_level=.95)
|
s = ("Tukey's HSD Pairwise Group Comparisons" |
|
f" ({self._ci_cl*100:.1f}% Confidence Interval)\n") |
|
s += "Comparison Statistic p-value Lower CI Upper CI\n" |
|
for i in range(self.pvalue.shape[0]): |
|
for j in range(self.pvalue.shape[0]): |
|
if i != j: |
|
s += (f" ({i} - {j}) {self.statistic[i, j]:>10.3f}" |
|
f"{self.pvalue[i, j]:>10.3f}" |
|
f"{self._ci.low[i, j]:>10.3f}" |
|
f"{self._ci.high[i, j]:>10.3f}\n") |
|
return s |
|
|
|
def confidence_interval(self, confidence_level=.95): |
|
"""Compute the confidence interval for the specified confidence level. |
|
|
|
Parameters |
|
---------- |
|
        confidence_level : float, optional

            Confidence level for the computed confidence interval

            of the estimated mean differences. Default is .95.
|
|
|
Returns |
|
------- |
|
ci : ``ConfidenceInterval`` object |
|
The object has attributes ``low`` and ``high`` that hold the |
|
lower and upper bounds of the confidence intervals for each |
|
comparison. The high and low values are accessible for each |
|
comparison at index ``(i, j)`` between groups ``i`` and ``j``. |
|
|
|
References |
|
---------- |
|
.. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. |
|
Tukey's Method." |
|
https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm, |
|
28 November 2020. |
|
|
|
Examples |
|
-------- |
|
>>> from scipy.stats import tukey_hsd |
|
>>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9] |
|
>>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1] |
|
>>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8] |
|
>>> result = tukey_hsd(group0, group1, group2) |
|
>>> ci = result.confidence_interval() |
|
>>> ci.low |
|
array([[-3.649159, -8.249159, -3.909159], |
|
[ 0.950841, -3.649159, 0.690841], |
|
[-3.389159, -7.989159, -3.649159]]) |
|
>>> ci.high |
|
array([[ 3.649159, -0.950841, 3.389159], |
|
[ 8.249159, 3.649159, 7.989159], |
|
[ 3.909159, -0.690841, 3.649159]]) |
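


        Each bound is the test statistic plus or minus the critical value

        of the studentized range distribution times the standard error,

        so the interval is symmetric about the statistic; a quick check:



        >>> import numpy as np

        >>> bool(np.allclose((ci.low + ci.high) / 2, result.statistic))

        True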
|
""" |
|
|
|
|
|
        # return the previously computed CI if the supplied confidence

        # level matches the one it was computed for

        if (self._ci is not None and self._ci_cl is not None and

                confidence_level == self._ci_cl):

            return self._ci
|
|
|
if not 0 < confidence_level < 1: |
|
raise ValueError("Confidence level must be between 0 and 1.") |
|
|
|
|
|
|
|
|
|
|
|
|
|
        # determine the critical value of the studentized range for the

        # given confidence level, the number of treatments, and the

        # degrees of freedom (total observations less the number of

        # treatments); see "Confidence limits for Tukey's method" [1]

        params = (confidence_level, self._ntreatments,

                  self._nobs - self._ntreatments)

        srd = distributions.studentized_range.ppf(*params)



        # the tukey criterion is the half-width of the confidence interval

        tukey_criterion = srd * self._stand_err
|
|
|
|
|
upper_conf = self.statistic + tukey_criterion |
|
lower_conf = self.statistic - tukey_criterion |
|
self._ci = ConfidenceInterval(low=lower_conf, high=upper_conf) |
|
self._ci_cl = confidence_level |
|
return self._ci |
|
|
|
|
|
def _tukey_hsd_iv(args): |
|
    if len(args) < 2:
|
raise ValueError("There must be more than 1 treatment.") |
|
args = [np.asarray(arg) for arg in args] |
|
for arg in args: |
|
if arg.ndim != 1: |
|
raise ValueError("Input samples must be one-dimensional.") |
|
if arg.size <= 1: |
|
raise ValueError("Input sample size must be greater than one.") |
|
if np.isinf(arg).any(): |
|
raise ValueError("Input samples must be finite.") |
|
return args |
|
|
|
|
|
def tukey_hsd(*args): |
|
"""Perform Tukey's HSD test for equality of means over multiple treatments. |
|
|
|
Tukey's honestly significant difference (HSD) test performs pairwise |
|
comparison of means for a set of samples. Whereas ANOVA (e.g. `f_oneway`) |
|
assesses whether the true means underlying each sample are identical, |
|
Tukey's HSD is a post hoc test used to compare the mean of each sample |
|
to the mean of each other sample. |
|
|
|
The null hypothesis is that the distributions underlying the samples all |
|
have the same mean. The test statistic, which is computed for every |
|
possible pairing of samples, is simply the difference between the sample |
|
means. For each pair, the p-value is the probability under the null |
|
hypothesis (and other assumptions; see notes) of observing such an extreme |
|
value of the statistic, considering that many pairwise comparisons are |
|
being performed. Confidence intervals for the difference between each pair |
|
of means are also available. |
|
|
|
Parameters |
|
---------- |
|
sample1, sample2, ... : array_like |
|
The sample measurements for each group. There must be at least |
|
two arguments. |
|
|
|
Returns |
|
------- |
|
result : `~scipy.stats._result_classes.TukeyHSDResult` instance |
|
The return value is an object with the following attributes: |
|
|
|
statistic : float ndarray |
|
The computed statistic of the test for each comparison. The element |
|
at index ``(i, j)`` is the statistic for the comparison between |
|
groups ``i`` and ``j``. |
|
pvalue : float ndarray |
|
The computed p-value of the test for each comparison. The element |
|
at index ``(i, j)`` is the p-value for the comparison between |
|
groups ``i`` and ``j``. |
|
|
|
The object has the following methods: |
|
|
|
confidence_interval(confidence_level=0.95): |
|
Compute the confidence interval for the specified confidence level. |
|
|
|
See Also |
|
-------- |
|
dunnett : performs comparison of means against a control group. |
|
|
|
Notes |
|
----- |
|
The use of this test relies on several assumptions. |
|
|
|
1. The observations are independent within and among groups. |
|
2. The observations within each group are normally distributed. |
|
3. The distributions from which the samples are drawn have the same finite |
|
variance. |
|
|
|
    The original formulation of the test was for samples of equal size [6]_.

    In case of unequal sample sizes, the test uses the Tukey-Kramer method

    [4]_, as demonstrated in the final example below.
|
|
|
References |
|
---------- |
|
.. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's |
|
Method." |
|
https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm, |
|
28 November 2020. |
|
.. [2] Abdi, Herve & Williams, Lynne. (2021). "Tukey's Honestly Significant |
|
Difference (HSD) Test." |
|
https://personal.utdallas.edu/~herve/abdi-HSD2010-pretty.pdf |
|
.. [3] "One-Way ANOVA Using SAS PROC ANOVA & PROC GLM." SAS |
|
Tutorials, 2007, www.stattutorials.com/SAS/TUTORIAL-PROC-GLM.htm. |
|
.. [4] Kramer, Clyde Young. "Extension of Multiple Range Tests to Group |
|
Means with Unequal Numbers of Replications." Biometrics, vol. 12, |
|
no. 3, 1956, pp. 307-310. JSTOR, www.jstor.org/stable/3001469. |
|
Accessed 25 May 2021. |
|
.. [5] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.3.3. |
|
The ANOVA table and tests of hypotheses about means" |
|
https://www.itl.nist.gov/div898/handbook/prc/section4/prc433.htm, |
|
2 June 2021. |
|
.. [6] Tukey, John W. "Comparing Individual Means in the Analysis of |
|
Variance." Biometrics, vol. 5, no. 2, 1949, pp. 99-114. JSTOR, |
|
www.jstor.org/stable/3001913. Accessed 14 June 2021. |
|
|
|
|
|
Examples |
|
-------- |
|
Here are some data comparing the time to relief of three brands of |
|
headache medicine, reported in minutes. Data adapted from [3]_. |
|
|
|
>>> import numpy as np |
|
>>> from scipy.stats import tukey_hsd |
|
>>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9] |
|
>>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1] |
|
>>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8] |
|
|
|
We would like to see if the means between any of the groups are |
|
significantly different. First, visually examine a box and whisker plot. |
|
|
|
>>> import matplotlib.pyplot as plt |
|
>>> fig, ax = plt.subplots(1, 1) |
|
    >>> ax.boxplot([group0, group1, group2])  # doctest: +SKIP
|
>>> ax.set_xticklabels(["group0", "group1", "group2"]) # doctest: +SKIP |
|
>>> ax.set_ylabel("mean") # doctest: +SKIP |
|
>>> plt.show() |
|
|
|
    From the box and whisker plot, we can see that ``group1`` is shifted

    upward relative to ``group0`` and ``group2``, whose ranges overlap

    substantially. We apply the ``tukey_hsd`` test to determine whether the

    differences between the means are significant. We set a significance

    level of .05 to reject the null hypothesis.
|
|
|
>>> res = tukey_hsd(group0, group1, group2) |
|
>>> print(res) |
|
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval) |
|
Comparison Statistic p-value Lower CI Upper CI |
|
(0 - 1) -4.600 0.014 -8.249 -0.951 |
|
(0 - 2) -0.260 0.980 -3.909 3.389 |
|
(1 - 0) 4.600 0.014 0.951 8.249 |
|
(1 - 2) 4.340 0.020 0.691 7.989 |
|
(2 - 0) 0.260 0.980 -3.389 3.909 |
|
(2 - 1) -4.340 0.020 -7.989 -0.691 |
|
|
|
    The null hypothesis is that each group has the same mean. The p-values

    for the comparisons between ``group0`` and ``group1`` as well as between

    ``group1`` and ``group2`` do not exceed .05, so we reject the null

    hypothesis that these pairs have the same means. The p-value of the

    comparison between ``group0`` and ``group2`` exceeds .05, so we fail to

    reject the null hypothesis that their means are equal.
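


    The test statistic for each pair is simply the difference of the

    sample means, which we can verify directly:



    >>> diff = np.mean(group0) - np.mean(group1)

    >>> bool(np.isclose(res.statistic[0, 1], diff))

    True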
|
|
|
We can also compute the confidence interval associated with our chosen |
|
confidence level. |
|
|
|
    >>> conf = res.confidence_interval(confidence_level=.99)
|
>>> for ((i, j), l) in np.ndenumerate(conf.low): |
|
... # filter out self comparisons |
|
... if i != j: |
|
... h = conf.high[i,j] |
|
... print(f"({i} - {j}) {l:>6.3f} {h:>6.3f}") |
|
(0 - 1) -9.480 0.280 |
|
(0 - 2) -5.140 4.620 |
|
(1 - 0) -0.280 9.480 |
|
(1 - 2) -0.540 9.220 |
|
(2 - 0) -4.620 5.140 |
|
(2 - 1) -9.220 0.540 |
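


    The groups need not be the same size; with unequal sizes, the

    Tukey-Kramer method is applied automatically. A minimal sketch with a

    hypothetical third group of three observations:



    >>> group3 = [31.2, 29.9, 32.4]

    >>> res_unequal = tukey_hsd(group0, group1, group3)

    >>> res_unequal.pvalue.shape

    (3, 3)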
|
""" |
|
args = _tukey_hsd_iv(args) |
|
ntreatments = len(args) |
|
means = np.asarray([np.mean(arg) for arg in args]) |
|
nsamples_treatments = np.asarray([a.size for a in args]) |
|
nobs = np.sum(nsamples_treatments) |
|
|
|
|
|
|
|
    # determine the mean square error (sometimes called the mean

    # square error within) [5]

    mse = (np.sum([np.var(arg, ddof=1) for arg in args] *

                  (nsamples_treatments - 1)) / (nobs - ntreatments))
|
|
|
|
|
|
|
    # the calculation of the standard error differs when treatments

    # differ in size; see "Unequal sample sizes" in [1]

    if np.unique(nsamples_treatments).size == 1:

        # all input groups are the same length, so only one scaling

        # value needs to be computed [1]

        normalize = 2 / nsamples_treatments[0]

    else:

        # to compare groups of differing sizes, compute a scaling value

        # for each individual comparison using broadcasting; this is

        # the Tukey-Kramer method [4]

        normalize = 1 / nsamples_treatments + 1 / nsamples_treatments[None].T
|
|
|
|
|
|
|
    # the standard error is used both in the tukey criterion and in

    # the computation of the p-values

    stand_err = np.sqrt(normalize * mse / 2)
|
|
|
|
|
    # the mean difference is the test statistic

    mean_differences = means[None].T - means
|
|
|
|
|
|
|
    # calculate the t-statistic to use with the survival function of

    # the studentized range distribution to obtain the p-values

    t_stat = np.abs(mean_differences) / stand_err

    params = t_stat, ntreatments, nobs - ntreatments

    pvalues = distributions.studentized_range.sf(*params)
|
|
|
return TukeyHSDResult(mean_differences, pvalues, ntreatments, |
|
nobs, stand_err) |
|
|