Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified about 1 month ago

9.57 kB

	import operator
	from dataclasses import dataclass
	import numpy as np
	from scipy.special import ndtri
	from ._common import ConfidenceInterval


	def _validate_int(n, bound, name):
	msg = f'{name} must be an integer not less than {bound}, but got {n!r}'
	try:
	n = operator.index(n)
	except TypeError:
	raise TypeError(msg) from None
	if n < bound:
	raise ValueError(msg)
	return n


	@dataclass
	class RelativeRiskResult:
	"""
	Result of `scipy.stats.contingency.relative_risk`.

	Attributes
	----------
	relative_risk : float
	This is::

	(exposed_cases/exposed_total) / (control_cases/control_total)

	exposed_cases : int
	The number of "cases" (i.e. occurrence of disease or other event
	of interest) among the sample of "exposed" individuals.
	exposed_total : int
	The total number of "exposed" individuals in the sample.
	control_cases : int
	The number of "cases" among the sample of "control" or non-exposed
	individuals.
	control_total : int
	The total number of "control" individuals in the sample.

	Methods
	-------
	confidence_interval :
	Compute the confidence interval for the relative risk estimate.
	"""

	relative_risk: float
	exposed_cases: int
	exposed_total: int
	control_cases: int
	control_total: int

	def confidence_interval(self, confidence_level=0.95):
	"""
	Compute the confidence interval for the relative risk.

	The confidence interval is computed using the Katz method
	(i.e. "Method C" of [1]_; see also [2]_, section 3.1.2).

	Parameters
	----------
	confidence_level : float, optional
	The confidence level to use for the confidence interval.
	Default is 0.95.

	Returns
	-------
	ci : ConfidenceInterval instance
	The return value is an object with attributes ``low`` and
	``high`` that hold the confidence interval.

	References
	----------
	.. [1] D. Katz, J. Baptista, S. P. Azen and M. C. Pike, "Obtaining
	confidence intervals for the risk ratio in cohort studies",
	Biometrics, 34, 469-474 (1978).
	.. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
	CRC Press LLC, Boca Raton, FL, USA (1996).


	Examples
	--------
	>>> from scipy.stats.contingency import relative_risk
	>>> result = relative_risk(exposed_cases=10, exposed_total=75,
	... control_cases=12, control_total=225)
	>>> result.relative_risk
	2.5
	>>> result.confidence_interval()
	ConfidenceInterval(low=1.1261564003469628, high=5.549850800541033)
	"""
	if not 0 <= confidence_level <= 1:
	raise ValueError('confidence_level must be in the interval '
	'[0, 1].')

	# Handle edge cases where either exposed_cases or control_cases
	# is zero. We follow the convention of the R function riskratio
	# from the epitools library.
	if self.exposed_cases == 0 and self.control_cases == 0:
	# relative risk is nan.
	return ConfidenceInterval(low=np.nan, high=np.nan)
	elif self.exposed_cases == 0:
	# relative risk is 0.
	return ConfidenceInterval(low=0.0, high=np.nan)
	elif self.control_cases == 0:
	# relative risk is inf
	return ConfidenceInterval(low=np.nan, high=np.inf)

	alpha = 1 - confidence_level
	z = ndtri(1 - alpha/2)
	rr = self.relative_risk

	# Estimate of the variance of log(rr) is
	# var(log(rr)) = 1/exposed_cases - 1/exposed_total +
	# 1/control_cases - 1/control_total
	# and the standard error is the square root of that.
	se = np.sqrt(1/self.exposed_cases - 1/self.exposed_total +
	1/self.control_cases - 1/self.control_total)
	delta = z*se
	katz_lo = rr*np.exp(-delta)
	katz_hi = rr*np.exp(delta)
	return ConfidenceInterval(low=katz_lo, high=katz_hi)


	def relative_risk(exposed_cases, exposed_total, control_cases, control_total):
	"""
	Compute the relative risk (also known as the risk ratio).

	This function computes the relative risk associated with a 2x2
	contingency table ([1]_, section 2.2.3; [2]_, section 3.1.2). Instead
	of accepting a table as an argument, the individual numbers that are
	used to compute the relative risk are given as separate parameters.
	This is to avoid the ambiguity of which row or column of the contingency
	table corresponds to the "exposed" cases and which corresponds to the
	"control" cases. Unlike, say, the odds ratio, the relative risk is not
	invariant under an interchange of the rows or columns.

	Parameters
	----------
	exposed_cases : nonnegative int
	The number of "cases" (i.e. occurrence of disease or other event
	of interest) among the sample of "exposed" individuals.
	exposed_total : positive int
	The total number of "exposed" individuals in the sample.
	control_cases : nonnegative int
	The number of "cases" among the sample of "control" or non-exposed
	individuals.
	control_total : positive int
	The total number of "control" individuals in the sample.

	Returns
	-------
	result : instance of `~scipy.stats._result_classes.RelativeRiskResult`
	The object has the float attribute ``relative_risk``, which is::

	rr = (exposed_cases/exposed_total) / (control_cases/control_total)

	The object also has the method ``confidence_interval`` to compute
	the confidence interval of the relative risk for a given confidence
	level.

	See Also
	--------
	odds_ratio

	Notes
	-----
	The R package epitools has the function `riskratio`, which accepts
	a table with the following layout::

	disease=0 disease=1
	exposed=0 (ref) n00 n01
	exposed=1 n10 n11

	With a 2x2 table in the above format, the estimate of the CI is
	computed by `riskratio` when the argument method="wald" is given,
	or with the function `riskratio.wald`.

	For example, in a test of the incidence of lung cancer among a
	sample of smokers and nonsmokers, the "exposed" category would
	correspond to "is a smoker" and the "disease" category would
	correspond to "has or had lung cancer".

	To pass the same data to ``relative_risk``, use::

	relative_risk(n11, n10 + n11, n01, n00 + n01)

	.. versionadded:: 1.7.0

	References
	----------
	.. [1] Alan Agresti, An Introduction to Categorical Data Analysis
	(second edition), Wiley, Hoboken, NJ, USA (2007).
	.. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
	CRC Press LLC, Boca Raton, FL, USA (1996).

	Examples
	--------
	>>> from scipy.stats.contingency import relative_risk

	This example is from Example 3.1 of [2]_. The results of a heart
	disease study are summarized in the following table::

	High CAT Low CAT Total
	-------- ------- -----
	CHD 27 44 71
	No CHD 95 443 538

	Total 122 487 609

	CHD is coronary heart disease, and CAT refers to the level of
	circulating catecholamine. CAT is the "exposure" variable, and
	high CAT is the "exposed" category. So the data from the table
	to be passed to ``relative_risk`` is::

	exposed_cases = 27
	exposed_total = 122
	control_cases = 44
	control_total = 487

	>>> result = relative_risk(27, 122, 44, 487)
	>>> result.relative_risk
	2.4495156482861398

	Find the confidence interval for the relative risk.

	>>> result.confidence_interval(confidence_level=0.95)
	ConfidenceInterval(low=1.5836990926700116, high=3.7886786315466354)

	The interval does not contain 1, so the data supports the statement
	that high CAT is associated with greater risk of CHD.
	"""
	# Relative risk is a trivial calculation. The nontrivial part is in the
	# `confidence_interval` method of the RelativeRiskResult class.

	exposed_cases = _validate_int(exposed_cases, 0, "exposed_cases")
	exposed_total = _validate_int(exposed_total, 1, "exposed_total")
	control_cases = _validate_int(control_cases, 0, "control_cases")
	control_total = _validate_int(control_total, 1, "control_total")

	if exposed_cases > exposed_total:
	raise ValueError('exposed_cases must not exceed exposed_total.')
	if control_cases > control_total:
	raise ValueError('control_cases must not exceed control_total.')

	if exposed_cases == 0 and control_cases == 0:
	# relative risk is 0/0.
	rr = np.nan
	elif exposed_cases == 0:
	# relative risk is 0/nonzero
	rr = 0.0
	elif control_cases == 0:
	# relative risk is nonzero/0.
	rr = np.inf
	else:
	p1 = exposed_cases / exposed_total
	p2 = control_cases / control_total
	rr = p1 / p2
	return RelativeRiskResult(relative_risk=rr,
	exposed_cases=exposed_cases,
	exposed_total=exposed_total,
	control_cases=control_cases,
	control_total=control_total)