Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified about 1 month ago

10.9 kB

	import numpy as np
	from numpy.testing import (assert_equal, assert_array_equal,
	assert_array_almost_equal, assert_approx_equal,
	assert_allclose)
	import pytest
	from pytest import raises as assert_raises
	from scipy import stats
	from scipy.special import xlogy
	from scipy.stats.contingency import (margins, expected_freq,
	chi2_contingency, association)


	def test_margins():
	a = np.array([1])
	m = margins(a)
	assert_equal(len(m), 1)
	m0 = m[0]
	assert_array_equal(m0, np.array([1]))

	a = np.array([[1]])
	m0, m1 = margins(a)
	expected0 = np.array([[1]])
	expected1 = np.array([[1]])
	assert_array_equal(m0, expected0)
	assert_array_equal(m1, expected1)

	a = np.arange(12).reshape(2, 6)
	m0, m1 = margins(a)
	expected0 = np.array([[15], [51]])
	expected1 = np.array([[6, 8, 10, 12, 14, 16]])
	assert_array_equal(m0, expected0)
	assert_array_equal(m1, expected1)

	a = np.arange(24).reshape(2, 3, 4)
	m0, m1, m2 = margins(a)
	expected0 = np.array([[[66]], [[210]]])
	expected1 = np.array([[[60], [92], [124]]])
	expected2 = np.array([[[60, 66, 72, 78]]])
	assert_array_equal(m0, expected0)
	assert_array_equal(m1, expected1)
	assert_array_equal(m2, expected2)


	def test_expected_freq():
	assert_array_equal(expected_freq([1]), np.array([1.0]))

	observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
	e = expected_freq(observed)
	assert_array_equal(e, np.ones_like(observed))

	observed = np.array([[10, 10, 20], [20, 20, 20]])
	e = expected_freq(observed)
	correct = np.array([[12., 12., 16.], [18., 18., 24.]])
	assert_array_almost_equal(e, correct)


	class TestChi2Contingency:
	def test_chi2_contingency_trivial(self):
	# Some very simple tests for chi2_contingency.

	# A trivial case
	obs = np.array([[1, 2], [1, 2]])
	chi2, p, dof, expected = chi2_contingency(obs, correction=False)
	assert_equal(chi2, 0.0)
	assert_equal(p, 1.0)
	assert_equal(dof, 1)
	assert_array_equal(obs, expected)

	# A really trivial case: 1-D data.
	obs = np.array([1, 2, 3])
	chi2, p, dof, expected = chi2_contingency(obs, correction=False)
	assert_equal(chi2, 0.0)
	assert_equal(p, 1.0)
	assert_equal(dof, 0)
	assert_array_equal(obs, expected)

	def test_chi2_contingency_R(self):
	# Some test cases that were computed independently, using R.

	# Rcode = \
	# """
	# # Data vector.
	# data <- c(
	# 12, 34, 23, 4, 47, 11,
	# 35, 31, 11, 34, 10, 18,
	# 12, 32, 9, 18, 13, 19,
	# 12, 12, 14, 9, 33, 25
	# )
	#
	# # Create factor tags:r=rows, c=columns, t=tiers
	# r <- factor(gl(4, 23, 23*4, labels=c("r1", "r2", "r3", "r4")))
	# c <- factor(gl(3, 1, 234, labels=c("c1", "c2", "c3")))
	# t <- factor(gl(2, 3, 234, labels=c("t1", "t2")))
	#
	# # 3-way Chi squared test of independence
	# s = summary(xtabs(data~r+c+t))
	# print(s)
	# """
	# Routput = \
	# """
	# Call: xtabs(formula = data ~ r + c + t)
	# Number of cases in table: 478
	# Number of factors: 3
	# Test for independence of all factors:
	# Chisq = 102.17, df = 17, p-value = 3.514e-14
	# """
	obs = np.array(
	[[[12, 34, 23],
	[35, 31, 11],
	[12, 32, 9],
	[12, 12, 14]],
	[[4, 47, 11],
	[34, 10, 18],
	[18, 13, 19],
	[9, 33, 25]]])
	chi2, p, dof, expected = chi2_contingency(obs)
	assert_approx_equal(chi2, 102.17, significant=5)
	assert_approx_equal(p, 3.514e-14, significant=4)
	assert_equal(dof, 17)

	# Rcode = \
	# """
	# # Data vector.
	# data <- c(
	# #
	# 12, 17,
	# 11, 16,
	# #
	# 11, 12,
	# 15, 16,
	# #
	# 23, 15,
	# 30, 22,
	# #
	# 14, 17,
	# 15, 16
	# )
	#
	# # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
	# r <- factor(gl(2, 2, 222*2, labels=c("r1", "r2")))
	# c <- factor(gl(2, 1, 222*2, labels=c("c1", "c2")))
	# d <- factor(gl(2, 4, 222*2, labels=c("d1", "d2")))
	# t <- factor(gl(2, 8, 222*2, labels=c("t1", "t2")))
	#
	# # 4-way Chi squared test of independence
	# s = summary(xtabs(data~r+c+d+t))
	# print(s)
	# """
	# Routput = \
	# """
	# Call: xtabs(formula = data ~ r + c + d + t)
	# Number of cases in table: 262
	# Number of factors: 4
	# Test for independence of all factors:
	# Chisq = 8.758, df = 11, p-value = 0.6442
	# """
	obs = np.array(
	[[[[12, 17],
	[11, 16]],
	[[11, 12],
	[15, 16]]],
	[[[23, 15],
	[30, 22]],
	[[14, 17],
	[15, 16]]]])
	chi2, p, dof, expected = chi2_contingency(obs)
	assert_approx_equal(chi2, 8.758, significant=4)
	assert_approx_equal(p, 0.6442, significant=4)
	assert_equal(dof, 11)

	def test_chi2_contingency_g(self):
	c = np.array([[15, 60], [15, 90]])
	g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood',
	correction=False)
	assert_allclose(g, 2*xlogy(c, c/e).sum())

	g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood',
	correction=True)
	c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
	assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())

	c = np.array([[10, 12, 10], [12, 10, 10]])
	g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
	assert_allclose(g, 2*xlogy(c, c/e).sum())

	def test_chi2_contingency_bad_args(self):
	# Test that "bad" inputs raise a ValueError.

	# Negative value in the array of observed frequencies.
	obs = np.array([[-1, 10], [1, 2]])
	assert_raises(ValueError, chi2_contingency, obs)

	# The zeros in this will result in zeros in the array
	# of expected frequencies.
	obs = np.array([[0, 1], [0, 1]])
	assert_raises(ValueError, chi2_contingency, obs)

	# A degenerate case: `observed` has size 0.
	obs = np.empty((0, 8))
	assert_raises(ValueError, chi2_contingency, obs)

	def test_chi2_contingency_yates_gh13875(self):
	# Magnitude of Yates' continuity correction should not exceed difference
	# between expected and observed value of the statistic; see gh-13875
	observed = np.array([[1573, 3], [4, 0]])
	p = chi2_contingency(observed)[1]
	assert_allclose(p, 1, rtol=1e-12)

	@pytest.mark.parametrize("correction", [False, True])
	def test_result(self, correction):
	obs = np.array([[1, 2], [1, 2]])
	res = chi2_contingency(obs, correction=correction)
	assert_equal((res.statistic, res.pvalue, res.dof, res.expected_freq), res)

	@pytest.mark.slow
	def test_exact_permutation(self):
	table = np.arange(4).reshape(2, 2)
	ref_statistic = chi2_contingency(table, correction=False).statistic
	ref_pvalue = stats.fisher_exact(table).pvalue
	method = stats.PermutationMethod(n_resamples=50000)
	res = chi2_contingency(table, correction=False, method=method)
	assert_equal(res.statistic, ref_statistic)
	assert_allclose(res.pvalue, ref_pvalue, rtol=1e-15)

	@pytest.mark.slow
	@pytest.mark.parametrize('method', (stats.PermutationMethod,
	stats.MonteCarloMethod))
	def test_resampling_randomized(self, method):
	rng = np.random.default_rng(2592340925)
	# need to have big sum for asymptotic approximation to be good
	rows = [300, 1000, 800]
	cols = [200, 400, 800, 700]
	table = stats.random_table(rows, cols, seed=rng).rvs()
	res = chi2_contingency(table, correction=False, method=method(rng=rng))
	ref = chi2_contingency(table, correction=False)
	assert_equal(res.statistic, ref.statistic)
	assert_allclose(res.pvalue, ref.pvalue, atol=5e-3)
	assert_equal(res.dof, np.nan)
	assert_equal(res.expected_freq, ref.expected_freq)

	def test_resampling_invalid_args(self):
	table = np.arange(8).reshape(2, 2, 2)

	method = stats.PermutationMethod()
	message = "Use of `method` is only compatible with two-way tables."
	with pytest.raises(ValueError, match=message):
	chi2_contingency(table, correction=False, method=method)

	table = np.arange(4).reshape(2, 2)

	method = stats.PermutationMethod()
	message = "`correction=True` is not compatible with..."
	with pytest.raises(ValueError, match=message):
	chi2_contingency(table, method=method)

	method = stats.MonteCarloMethod()
	message = "`lambda_=2` is not compatible with..."
	with pytest.raises(ValueError, match=message):
	chi2_contingency(table, correction=False, lambda_=2, method=method)

	method = 'herring'
	message = "`method='herring'` not recognized; if provided, `method`..."
	with pytest.raises(ValueError, match=message):
	chi2_contingency(table, correction=False, method=method)

	method = stats.MonteCarloMethod(rvs=stats.norm.rvs)
	message = "If the `method` argument of `chi2_contingency` is..."
	with pytest.raises(ValueError, match=message):
	chi2_contingency(table, correction=False, method=method)


	def test_bad_association_args():
	# Invalid Test Statistic
	assert_raises(ValueError, association, [[1, 2], [3, 4]], "X")
	# Invalid array shape
	assert_raises(ValueError, association, [[[1, 2]], [[3, 4]]], "cramer")
	# chi2_contingency exception
	assert_raises(ValueError, association, [[-1, 10], [1, 2]], 'cramer')
	# Invalid Array Item Data Type
	assert_raises(ValueError, association,
	np.array([[1, 2], ["dd", 4]], dtype=object), 'cramer')


	@pytest.mark.parametrize('stat, expected',
	[('cramer', 0.09222412010290792),
	('tschuprow', 0.0775509319944633),
	('pearson', 0.12932925727138758)])
	def test_assoc(stat, expected):
	# 2d Array
	obs1 = np.array([[12, 13, 14, 15, 16],
	[17, 16, 18, 19, 11],
	[9, 15, 14, 12, 11]])
	a = association(observed=obs1, method=stat)
	assert_allclose(a, expected)