# nmed2024 / adrd / model / calibration.py
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression
from functools import lru_cache
from functools import cached_property
from typing import Self, Any
from pickle import dump
from pickle import load
from abc import ABC, abstractmethod
from . import ADRDModel
from ..utils import Formatter
from ..utils import MissingMasker
def calibration_curve(
y_true: list[int],
y_pred: list[float],
n_bins: int = 10,
ratio: float = 1.0,
) -> tuple[list[float], list[float]]:
"""
Compute true and predicted probabilities for a calibration curve. The method
assumes the inputs come from a binary classifier, and discretize the [0, 1]
interval into bins.
Note that this function is an alternative to
sklearn.calibration.calibration_curve() which can only estimate the absolute
proportion of positive cases in each bin.
Parameters
----------
y_true : list[int]
True targets.
y_pred : list[float]
Probabilities of the positive class.
    n_bins : int, default=10
        Number of bins to discretize the [0, 1] interval. A bigger number
        requires more data. Bins with no samples (i.e. without corresponding
        values in y_pred) will not be returned, thus the returned arrays may
        have fewer than n_bins values.
    ratio : float, default=1.0
        Weight applied to the negative-class counts to adjust the class
        balance. Passing the positive-to-negative ratio of the dataset
        simulates a balanced calibration set; 1.0 leaves the data as-is.
Returns
-------
prob_true : list[float]
The proportion of positive samples in each bin.
prob_pred : list[float]
The mean predicted probability in each bin.
"""
# generate "n_bin" intervals
tmp = np.around(np.linspace(0, 1, n_bins + 1), decimals=6)
intvs = [(tmp[i - 1], tmp[i]) for i in range(1, len(tmp))]
# pair up (pred, true) and group them by intervals
tmp = list(zip(y_pred, y_true))
intv_pairs = {(l, r): [p for p in tmp if l <= p[0] < r] for l, r in intvs}
    # calculate balanced proportion of POSITIVE cases for each interval
# along with the balanced averaged predictions
intv_prob_true: dict[tuple, float] = dict()
intv_prob_pred: dict[tuple, float] = dict()
for intv, pairs in intv_pairs.items():
# number of cases that fall into the interval
n_pairs = len(pairs)
# it's likely that no predictions fall into the interval
if n_pairs == 0: continue
# count number of positives and negatives in the interval
n_pos = sum([p[1] for p in pairs])
n_neg = n_pairs - n_pos
# calculate adjusted proportion of positives
intv_prob_true[intv] = n_pos / (n_pos + n_neg * ratio)
# calculate adjusted avg. predictions
sum_pred_pos = sum([p[0] for p in pairs if p[1] == 1])
sum_pred_neg = sum([p[0] for p in pairs if p[1] == 0])
intv_prob_pred[intv] = (sum_pred_pos + sum_pred_neg * ratio)
intv_prob_pred[intv] /= (n_pos + n_neg * ratio)
prob_true = list(intv_prob_true.values())
prob_pred = list(intv_prob_pred.values())
return prob_true, prob_pred
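# A quick illustrative check of calibration_curve() with made-up values:
# using n_bins=2 and the default ratio, the four pairs below split evenly
# into [0, 0.5) and [0.5, 1.0], each bin holding one positive and one
# negative sample:
#
#   >>> calibration_curve([0, 1, 0, 1], [0.2, 0.3, 0.6, 0.9], n_bins=2)
#   ([0.5, 0.5], [0.25, 0.75])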
class CalibrationCore(BaseEstimator):
    """
    A wrapper class of regressors to predict the proportion of positive
    samples from the predicted probabilities. The calibration method can be
    'sigmoid', which corresponds to Platt's method (i.e. a logistic
    regression model), or 'isotonic', which is a non-parametric approach. It
    is not advised to use isotonic calibration with too few calibration
    samples (<<1000) since it tends to overfit.
    TODO
    ----
    - 'sigmoid' method is not trivial to implement.
    """
def __init__(self,
method: str = 'isotonic',
) -> None:
"""
Initialization function of CalibrationCore class.
Parameters
----------
        method : {'sigmoid', 'isotonic'}, default='isotonic'
            The method to use for calibration. Can be 'sigmoid', which
            corresponds to Platt's method (i.e. a logistic regression model),
            or 'isotonic', which is a non-parametric approach. It is not
            advised to use isotonic calibration with too few calibration
            samples (<<1000) since it tends to overfit.
Raises
------
ValueError
Sigmoid approach has not been implemented.
"""
assert method in ('sigmoid', 'isotonic')
if method == 'sigmoid':
raise ValueError('Sigmoid approach has not been implemented.')
self.method = method
def fit(self,
prob_pred: list[float],
prob_true: list[float],
) -> Self:
"""
Fit the underlying regressor using prob_pred, prob_true as training
data.
Parameters
----------
prob_pred : list[float]
Probabilities predicted directly by a model.
prob_true : list[float]
Target probabilities to calibrate to.
Returns
-------
Self
CalibrationCore object.
"""
        # using Platt's method for calibration; note that this branch is
        # currently unreachable because __init__ rejects 'sigmoid'
if self.method == 'sigmoid':
self.model_ = LogisticRegression()
self.model_.fit(prob_pred, prob_true)
# using isotonic calibration
elif self.method == 'isotonic':
self.model_ = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
self.model_.fit(prob_pred, prob_true)
return self
def predict(self,
prob_pred: list[float],
) -> list[float]:
"""
Calibrate the input probabilities using the fitted regressor.
Parameters
----------
prob_pred : list[float]
Probabilities predicted directly by a model.
Returns
-------
prob_cali : list[float]
Calibrated probabilities.
"""
# as usual, the core needs to be fitted
check_is_fitted(self)
        # note that logistic regression is a classification model, so we need
        # to call 'predict_proba' instead of 'predict' to get the calibrated
        # results
if self.method == 'sigmoid':
prob_cali = self.model_.predict_proba(prob_pred)
elif self.method == 'isotonic':
prob_cali = self.model_.predict(prob_pred)
return prob_cali
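# Minimal usage sketch for CalibrationCore; only the 'isotonic' path is
# exercised since __init__ rejects 'sigmoid', and the probabilities below
# are fabricated for illustration:
#
#   core = CalibrationCore(method='isotonic')
#   core.fit(prob_pred=[0.1, 0.4, 0.8], prob_true=[0.05, 0.5, 0.9])
#   core.predict([0.3])  # interpolates between the fitted points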
class CalibratedClassifier(ABC):
"""
Abstract class of calibrated classifier.
"""
def __init__(self,
model: ADRDModel,
background_src: list[dict[str, Any]],
background_tgt: list[dict[str, Any]],
background_is_embedding: dict[str, bool] | None = None,
method: str = 'isotonic',
) -> None:
"""
Constructor of Calibrator class.
Parameters
----------
model : ADRDModel
Fitted model to calibrate.
background_src : list[dict[str, Any]]
Features of the background dataset.
background_tgt : list[dict[str, Any]]
Labels of the background dataset.
method : {'sigmoid', 'isotonic'}, default='isotonic'
Method used by the underlying regressor.
"""
self.method = method
self.model = model
self.src_modalities = model.src_modalities
self.tgt_modalities = model.tgt_modalities
self.background_is_embedding = background_is_embedding
# format background data
fmt_src = Formatter(self.src_modalities)
fmt_tgt = Formatter(self.tgt_modalities)
self.background_src = [fmt_src(smp) for smp in background_src]
self.background_tgt = [fmt_tgt(smp) for smp in background_tgt]
@abstractmethod
def predict_proba(self,
src: list[dict[str, Any]],
is_embedding: dict[str, bool] | None = None,
) -> list[dict[str, float]]:
"""
This method returns calibrated probabilities of classification.
Parameters
----------
        src : list[dict[str, Any]]
            Features of the input samples.
        is_embedding : dict[str, bool] | None, default=None
            Flags indicating which input features are embeddings; passed
            through to the wrapped model.
Returns
-------
list[dict[str, float]]
Calibrated probabilities.
"""
pass
def predict(self,
src: list[dict[str, Any]],
is_embedding: dict[str, bool] | None = None,
) -> list[dict[str, int]]:
"""
Make predictions based on the results of predict_proba().
Parameters
----------
        src : list[dict[str, Any]]
            Input features.
        is_embedding : dict[str, bool] | None, default=None
            Flags indicating which input features are embeddings; passed
            through to the wrapped model.
Returns
-------
list[dict[str, int]]
Calibrated predictions.
"""
proba = self.predict_proba(src, is_embedding)
return [{k: int(smp[k] > 0.5) for k in self.tgt_modalities} for smp in proba]
def save(self,
filepath_state_dict: str,
) -> None:
"""
Save the state dict and the underlying model to the given paths.
Parameters
----------
filepath_state_dict : str
File path to save the state_dict which includes the background
dataset and the regressor information.
filepath_wrapped_model : str | None, default=None
File path to save the wrapped model. If None, the model won't be
saved.
"""
# save state dict
state_dict = {
'background_src': self.background_src,
'background_tgt': self.background_tgt,
'background_is_embedding': self.background_is_embedding,
'method': self.method,
}
with open(filepath_state_dict, 'wb') as f:
dump(state_dict, f)
@classmethod
def from_ckpt(cls,
filepath_state_dict: str,
filepath_wrapped_model: str,
) -> Self:
"""
Alternative constructor which loads from checkpoint.
Parameters
----------
filepath_state_dict : str
File path to load the state_dict which includes the background
dataset and the regressor information.
filepath_wrapped_model : str
File path of the wrapped model.
Returns
-------
Self
CalibratedClassifier class object.
"""
with open(filepath_state_dict, 'rb') as f:
kwargs = load(f)
kwargs['model'] = ADRDModel.from_ckpt(filepath_wrapped_model)
return cls(**kwargs)
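# Typical save/load round trip (a sketch; the file paths are hypothetical,
# and the concrete subclass must be one of the implementations below, e.g.
# StaticCalibratedClassifier):
#
#   clf.save('calibrator_state.pkl')
#   clf = StaticCalibratedClassifier.from_ckpt(
#       'calibrator_state.pkl', 'adrd_model.ckpt')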
class DynamicCalibratedClassifier(CalibratedClassifier):
    """
    The dynamic approach generates background predictions based on the
    missingness pattern of each input. Because the number of possible
    missingness patterns is astronomical, calibrating each sample requires
    running the ADRDModel on most of the background data and training a
    corresponding regressor, which makes the computation intensive.
    """
def predict_proba(self,
src: list[dict[str, Any]],
is_embedding: dict[str, bool] | None = None,
) -> list[dict[str, float]]:
# initialize mask generator and format inputs
msk_gen = MissingMasker(self.src_modalities)
fmt_src = Formatter(self.src_modalities)
src = [fmt_src(smp) for smp in src]
# calculate calibrated probabilities
calibrated_prob: list[dict[str, float]] = []
for smp in src:
# model output and missingness pattern
prob = self.model.predict_proba([smp], is_embedding)[0]
mask = tuple(msk_gen(smp).values())
# get/fit core and calculate calibrated probabilities
core = self._fit_core(mask)
calibrated_prob.append({k: core[k].predict([prob[k]])[0] for k in self.tgt_modalities})
return calibrated_prob
# @lru_cache(maxsize = None)
    def _fit_core(self,
        missingness_pattern: tuple[bool, ...],
    ) -> dict[str, CalibrationCore]:
        ''' Fit one CalibrationCore per target for the given missingness pattern. '''
        # remove features from all background samples accordingly
        background_src, background_tgt = [], []
        for src, tgt in zip(self.background_src, self.background_tgt):
            src = {k: v for j, (k, v) in enumerate(src.items()) if not missingness_pattern[j]}
            # make sure at least one feature has an observed (non-None) value
            if not any(v is not None for v in src.values()): continue
            background_src.append(src)
            background_tgt.append(tgt)
        # run model on background samples and collect predictions
background_prob = self.model.predict_proba(background_src, self.background_is_embedding, _batch_size=1024)
# list[dict] -> dict[list]
N = len(background_src)
background_prob = {k: [background_prob[i][k] for i in range(N)] for k in self.tgt_modalities}
background_true = {k: [background_tgt[i][k] for i in range(N)] for k in self.tgt_modalities}
# now, fit cores
core: dict[str, CalibrationCore] = dict()
for k in self.tgt_modalities:
prob_true, prob_pred = calibration_curve(
background_true[k], background_prob[k],
ratio = self.background_ratio[k],
)
core[k] = CalibrationCore(self.method).fit(prob_pred, prob_true)
return core
@cached_property
def background_ratio(self) -> dict[str, float]:
''' The ratio of positives over negatives in the background dataset. '''
return {k: self.background_n_pos[k] / self.background_n_neg[k] for k in self.tgt_modalities}
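    # worked example with fabricated counts: 30 positives and 90 negatives
    # for some target give background_ratio = 30 / 90 = 1/3; passing this to
    # calibration_curve() scales each negative count by 1/3, so the negatives
    # contribute as much as the 30 positives overall.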
@cached_property
def background_n_pos(self) -> dict[str, int]:
''' Number of positives w.r.t each target in the background dataset. '''
return {k: sum([d[k] for d in self.background_tgt]) for k in self.tgt_modalities}
@cached_property
def background_n_neg(self) -> dict[str, int]:
''' Number of negatives w.r.t each target in the background dataset. '''
return {k: len(self.background_tgt) - self.background_n_pos[k] for k in self.tgt_modalities}
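# Sketch of the per-pattern filtering performed by _fit_core above (feature
# names are hypothetical): if a query sample observes only 'age' out of
# ('age', 'mmse'), its mask is (False, True), and every background sample is
# reduced to {'age': ...} before the model is re-run, so the calibration
# curve reflects the same missingness pattern as the query.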
class StaticCalibratedClassifier(CalibratedClassifier):
    """
    The static approach generates background predictions without considering
    the missingness patterns, so the calibration regressors are fitted only
    once and cached.
    """
def predict_proba(self,
src: list[dict[str, Any]],
is_embedding: dict[str, bool] | None = None,
) -> list[dict[str, float]]:
# number of input samples
N = len(src)
# format inputs, and run ADRDModel, and convert to dict[list]
fmt_src = Formatter(self.src_modalities)
src = [fmt_src(smp) for smp in src]
prob = self.model.predict_proba(src, is_embedding)
prob = {k: [prob[i][k] for i in range(N)] for k in self.tgt_modalities}
# calibrate probabilities
core = self._fit_core()
calibrated_prob = {k: core[k].predict(prob[k]) for k in self.tgt_modalities}
# convert back to list[dict]
calibrated_prob: list[dict[str, float]] = [
{k: calibrated_prob[k][i] for k in self.tgt_modalities} for i in range(N)
]
return calibrated_prob
    @lru_cache(maxsize=None)
    def _fit_core(self) -> dict[str, CalibrationCore]:
        ''' Fit one CalibrationCore per target on the full background set. '''
        # run model on background samples and collect predictions
background_prob = self.model.predict_proba(self.background_src, self.background_is_embedding, _batch_size=1024)
# list[dict] -> dict[list]
N = len(self.background_src)
background_prob = {k: [background_prob[i][k] for i in range(N)] for k in self.tgt_modalities}
background_true = {k: [self.background_tgt[i][k] for i in range(N)] for k in self.tgt_modalities}
# now, fit cores
core: dict[str, CalibrationCore] = dict()
for k in self.tgt_modalities:
prob_true, prob_pred = calibration_curve(
background_true[k], background_prob[k],
ratio = 1.0,
)
core[k] = CalibrationCore(self.method).fit(prob_pred, prob_true)
return core
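# End-to-end usage sketch; everything below is illustrative, and the
# checkpoint path and the contents of the feature/label dicts are
# hypothetical, depending on how the wrapped ADRDModel was trained:
#
#   model = ADRDModel.from_ckpt('adrd_model.ckpt')
#   clf = StaticCalibratedClassifier(
#       model=model,
#       background_src=background_features,  # list[dict[str, Any]]
#       background_tgt=background_labels,    # list[dict[str, int]]
#   )
#   proba = clf.predict_proba(test_features)  # list[dict[str, float]]
#   preds = clf.predict(test_features)        # each prob thresholded at 0.5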