|
import numpy as np |
|
from sklearn.base import BaseEstimator |
|
from sklearn.utils.validation import check_is_fitted |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.isotonic import IsotonicRegression |
|
from functools import lru_cache |
|
from functools import cached_property |
|
from typing import Self, Any |
|
from pickle import dump |
|
from pickle import load |
|
from abc import ABC, abstractmethod |
|
|
|
from . import ADRDModel |
|
from ..utils import Formatter |
|
from ..utils import MissingMasker |
|
|
|
|
|
def calibration_curve(
    y_true: list[int],
    y_pred: list[float],
    n_bins: int = 10,
    ratio: float = 1.0,
) -> tuple[list[float], list[float]]:
    """
    Compute true and predicted probabilities for a calibration curve. The
    method assumes the inputs come from a binary classifier, and discretizes
    the [0, 1] interval into bins.

    Note that this function is an alternative to
    sklearn.calibration.calibration_curve() which can only estimate the
    absolute proportion of positive cases in each bin.

    Parameters
    ----------
    y_true : list[int]
        True targets.
    y_pred : list[float]
        Probabilities of the positive class.
    n_bins : int, default=10
        Number of bins to discretize the [0, 1] interval. A bigger number
        requires more data. Bins with no samples (i.e. without corresponding
        values in y_pred) will not be returned, thus the returned arrays may
        have less than n_bins values.
    ratio : float, default=1.0
        Used to adjust the class balance; negative samples are weighted by
        this factor in both the true-proportion and mean-prediction formulas.

    Returns
    -------
    prob_true : list[float]
        The proportion of positive samples in each bin.
    prob_pred : list[float]
        The mean predicted probability in each bin.
    """
    # bin edges over [0, 1]; rounding suppresses float artifacts of linspace
    edges = np.around(np.linspace(0, 1, n_bins + 1), decimals=6)
    intvs = [(edges[i - 1], edges[i]) for i in range(1, len(edges))]

    # group (pred, true) pairs into half-open bins [lo, hi); the LAST bin is
    # closed on the right so a prediction of exactly 1.0 is not dropped
    # (the original `lo <= p < hi` test excluded 1.0 from every bin)
    pairs = list(zip(y_pred, y_true))
    last_edge = edges[-1]
    intv_pairs = {
        (lo, hi): [
            p for p in pairs
            if lo <= p[0] < hi or (hi == last_edge and p[0] == hi)
        ]
        for lo, hi in intvs
    }

    # per-bin proportion of positives and mean prediction, both reweighting
    # negatives by `ratio` to adjust the class balance
    intv_prob_true: dict[tuple, float] = dict()
    intv_prob_pred: dict[tuple, float] = dict()
    for intv, bin_pairs in intv_pairs.items():
        n_pairs = len(bin_pairs)

        # empty bins are omitted from the output
        if n_pairs == 0:
            continue

        n_pos = sum(p[1] for p in bin_pairs)
        n_neg = n_pairs - n_pos
        denom = n_pos + n_neg * ratio

        intv_prob_true[intv] = n_pos / denom

        sum_pred_pos = sum(p[0] for p in bin_pairs if p[1] == 1)
        sum_pred_neg = sum(p[0] for p in bin_pairs if p[1] == 0)
        intv_prob_pred[intv] = (sum_pred_pos + sum_pred_neg * ratio) / denom

    prob_true = list(intv_prob_true.values())
    prob_pred = list(intv_prob_pred.values())
    return prob_true, prob_pred
|
|
|
|
|
class CalibrationCore(BaseEstimator):
    """
    A thin wrapper around a regressor that maps predicted probabilities to
    observed proportions of positive samples. The calibration method can be
    'sigmoid', corresponding to Platt's method (a logistic regression model),
    or 'isotonic', a non-parametric approach. Isotonic calibration is not
    advised with too few calibration samples (<<1000) since it tends to
    overfit.

    TODO
    ----
    - 'sigmoid' method is not trivial to implement.
    """

    def __init__(self,
        method: str = 'isotonic',
    ) -> None:
        """
        Initialize the core with the requested calibration method.

        Parameters
        ----------
        method : {'sigmoid', 'isotonic'}, default='isotonic'
            The method to use for calibration. 'sigmoid' corresponds to
            Platt's method (i.e. a logistic regression model); 'isotonic' is
            a non-parametric approach. It is not advised to use isotonic
            calibration with too few calibration samples (<<1000) since it
            tends to overfit.

        Raises
        ------
        ValueError
            Sigmoid approach has not been implemented.
        """
        assert method in ('sigmoid', 'isotonic')
        if method == 'sigmoid':
            raise ValueError('Sigmoid approach has not been implemented.')
        self.method = method

    def fit(self,
        prob_pred: list[float],
        prob_true: list[float],
    ) -> Self:
        """
        Fit the underlying regressor on (prob_pred, prob_true) pairs.

        Parameters
        ----------
        prob_pred : list[float]
            Probabilities predicted directly by a model.
        prob_true : list[float]
            Target probabilities to calibrate to.

        Returns
        -------
        Self
            CalibrationCore object.
        """
        if self.method == 'sigmoid':
            # Platt scaling via logistic regression (currently unreachable:
            # __init__ rejects 'sigmoid')
            self.model_ = LogisticRegression()
            self.model_.fit(prob_pred, prob_true)
        elif self.method == 'isotonic':
            # clip out-of-range inputs at predict time to stay within [0, 1]
            self.model_ = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
            self.model_.fit(prob_pred, prob_true)
        return self

    def predict(self,
        prob_pred: list[float],
    ) -> list[float]:
        """
        Calibrate the input probabilities using the fitted regressor.

        Parameters
        ----------
        prob_pred : list[float]
            Probabilities predicted directly by a model.

        Returns
        -------
        prob_cali : list[float]
            Calibrated probabilities.
        """
        # raises NotFittedError unless fit() has been called
        check_is_fitted(self)

        if self.method == 'sigmoid':
            prob_cali = self.model_.predict_proba(prob_pred)
        elif self.method == 'isotonic':
            prob_cali = self.model_.predict(prob_pred)
        return prob_cali
|
|
|
|
|
class CalibratedClassifier(ABC):
    """
    Abstract class of calibrated classifier.

    Holds a fitted ADRDModel plus a background dataset; concrete subclasses
    implement predict_proba() to calibrate the model's probabilities against
    predictions made on that background data.
    """
    def __init__(self,
        model: ADRDModel,
        background_src: list[dict[str, Any]],
        background_tgt: list[dict[str, Any]],
        background_is_embedding: dict[str, bool] | None = None,
        method: str = 'isotonic',
    ) -> None:
        """
        Constructor of Calibrator class.

        Parameters
        ----------
        model : ADRDModel
            Fitted model to calibrate.
        background_src : list[dict[str, Any]]
            Features of the background dataset.
        background_tgt : list[dict[str, Any]]
            Labels of the background dataset.
        background_is_embedding : dict[str, bool] | None, default=None
            Forwarded to the wrapped model's predict_proba() when scoring the
            background data; presumably flags which source modalities are
            pre-computed embeddings — confirm against ADRDModel.
        method : {'sigmoid', 'isotonic'}, default='isotonic'
            Method used by the underlying regressor.
        """
        self.method = method
        self.model = model
        # mirror the wrapped model's modality specs for convenient access
        self.src_modalities = model.src_modalities
        self.tgt_modalities = model.tgt_modalities
        self.background_is_embedding = background_is_embedding

        # normalize the background samples to the model's modality formats
        fmt_src = Formatter(self.src_modalities)
        fmt_tgt = Formatter(self.tgt_modalities)
        self.background_src = [fmt_src(smp) for smp in background_src]
        self.background_tgt = [fmt_tgt(smp) for smp in background_tgt]

    @abstractmethod
    def predict_proba(self,
        src: list[dict[str, Any]],
        is_embedding: dict[str, bool] | None = None,
    ) -> list[dict[str, float]]:
        """
        This method returns calibrated probabilities of classification.

        Parameters
        ----------
        src : list[dict[str, Any]]
            Features of the input samples.
        is_embedding : dict[str, bool] | None, default=None
            Forwarded to the wrapped model's predict_proba().

        Returns
        -------
        list[dict[str, float]]
            Calibrated probabilities, one dict per sample keyed by target.
        """
        pass

    def predict(self,
        src: list[dict[str, Any]],
        is_embedding: dict[str, bool] | None = None,
    ) -> list[dict[str, int]]:
        """
        Make predictions based on the results of predict_proba().

        Parameters
        ----------
        src : list[dict[str, Any]]
            Features of the input samples.
        is_embedding : dict[str, bool] | None, default=None
            Forwarded to predict_proba().

        Returns
        -------
        list[dict[str, int]]
            Calibrated predictions (0/1 per target, thresholded at 0.5).
        """
        proba = self.predict_proba(src, is_embedding)
        return [{k: int(smp[k] > 0.5) for k in self.tgt_modalities} for smp in proba]

    def save(self,
        filepath_state_dict: str,
    ) -> None:
        """
        Save the state dict to the given path.

        Note that the wrapped ADRDModel is NOT saved here; persist it
        separately and pass its path to from_ckpt() when reloading.

        Parameters
        ----------
        filepath_state_dict : str
            File path to save the state_dict which includes the background
            dataset and the regressor information.
        """
        # everything needed to reconstruct the object except the model itself
        state_dict = {
            'background_src': self.background_src,
            'background_tgt': self.background_tgt,
            'background_is_embedding': self.background_is_embedding,
            'method': self.method,
        }
        with open(filepath_state_dict, 'wb') as f:
            dump(state_dict, f)

    @classmethod
    def from_ckpt(cls,
        filepath_state_dict: str,
        filepath_wrapped_model: str,
    ) -> Self:
        """
        Alternative constructor which loads from checkpoint.

        Parameters
        ----------
        filepath_state_dict : str
            File path to load the state_dict which includes the background
            dataset and the regressor information.
        filepath_wrapped_model : str
            File path of the wrapped model.

        Returns
        -------
        Self
            CalibratedClassifier class object.
        """
        # SECURITY: pickle.load executes arbitrary code during
        # deserialization — only load checkpoints from trusted sources
        with open(filepath_state_dict, 'rb') as f:
            kwargs = load(f)
        kwargs['model'] = ADRDModel.from_ckpt(filepath_wrapped_model)
        return cls(**kwargs)
|
|
|
|
|
class DynamicCalibratedClassifier(CalibratedClassifier):
    """
    The dynamic approach generates background predictions based on the
    missingness pattern of each input. With an astronomical number of
    missingness patterns, calibrating each sample requires a comprehensive
    process that involves running the ADRDModel on the majority of the
    background data and training a corresponding regressor. This results in a
    computationally intensive calculation.
    """
    def predict_proba(self,
        src: list[dict[str, Any]],
        is_embedding: dict[str, bool] | None = None,
    ) -> list[dict[str, float]]:
        """
        Calibrate probabilities sample-by-sample, fitting regressors on the
        background subset restricted to each sample's missingness pattern.

        Parameters
        ----------
        src : list[dict[str, Any]]
            Features of the input samples.
        is_embedding : dict[str, bool] | None, default=None
            Forwarded to the wrapped model's predict_proba().

        Returns
        -------
        list[dict[str, float]]
            Calibrated probabilities, one dict per sample keyed by target.
        """
        msk_gen = MissingMasker(self.src_modalities)
        fmt_src = Formatter(self.src_modalities)
        src = [fmt_src(smp) for smp in src]

        calibrated_prob: list[dict[str, float]] = []
        for smp in src:
            # raw model probabilities and this sample's missingness pattern
            prob = self.model.predict_proba([smp], is_embedding)[0]
            mask = tuple(msk_gen(smp).values())

            # fit per-target calibration cores for this pattern and apply them
            core = self._fit_core(mask)
            calibrated_prob.append({k: core[k].predict([prob[k]])[0] for k in self.tgt_modalities})

        return calibrated_prob

    def _fit_core(self,
        missingness_pattern: tuple[bool, ...],
    ) -> dict[str, CalibrationCore]:
        ''' Fit one CalibrationCore per target on the background data
        restricted to the features available under the given pattern. '''
        # keep only the features that are present (mask value False) in the
        # query sample
        background_src, background_tgt = [], []
        for src, tgt in zip(self.background_src, self.background_tgt):
            src = {k: v for j, (k, v) in enumerate(src.items()) if not missingness_pattern[j]}

            # skip background samples with no non-missing values left;
            # the original test, len([v is not None ...]) == 0, measured the
            # length of the boolean list (only 0 for an empty dict) instead
            # of whether any value is actually non-None
            if not any(v is not None for v in src.values()): continue
            background_src.append(src)
            background_tgt.append(tgt)

        # score the restricted background with the wrapped model
        background_prob = self.model.predict_proba(background_src, self.background_is_embedding, _batch_size=1024)

        # regroup from per-sample dicts to per-target lists
        N = len(background_src)
        background_prob = {k: [background_prob[i][k] for i in range(N)] for k in self.tgt_modalities}
        background_true = {k: [background_tgt[i][k] for i in range(N)] for k in self.tgt_modalities}

        # one calibration regressor per target, class-balance adjusted
        core: dict[str, CalibrationCore] = dict()
        for k in self.tgt_modalities:
            prob_true, prob_pred = calibration_curve(
                background_true[k], background_prob[k],
                ratio = self.background_ratio[k],
            )
            core[k] = CalibrationCore(self.method).fit(prob_pred, prob_true)

        return core

    @cached_property
    def background_ratio(self) -> dict[str, float]:
        ''' The ratio of positives over negatives in the background dataset. '''
        return {k: self.background_n_pos[k] / self.background_n_neg[k] for k in self.tgt_modalities}

    @cached_property
    def background_n_pos(self) -> dict[str, int]:
        ''' Number of positives w.r.t each target in the background dataset. '''
        return {k: sum(d[k] for d in self.background_tgt) for k in self.tgt_modalities}

    @cached_property
    def background_n_neg(self) -> dict[str, int]:
        ''' Number of negatives w.r.t each target in the background dataset. '''
        return {k: len(self.background_tgt) - self.background_n_pos[k] for k in self.tgt_modalities}
|
|
|
|
|
class StaticCalibratedClassifier(CalibratedClassifier):
    """
    The static approach generates background predictions without considering
    the missingness patterns.
    """
    def predict_proba(self,
        src: list[dict[str, Any]],
        is_embedding: dict[str, bool] | None = None,
    ) -> list[dict[str, float]]:
        """
        Calibrate probabilities using regressors fitted once on the full
        background dataset.

        Parameters
        ----------
        src : list[dict[str, Any]]
            Features of the input samples.
        is_embedding : dict[str, bool] | None, default=None
            Forwarded to the wrapped model's predict_proba().

        Returns
        -------
        list[dict[str, float]]
            Calibrated probabilities, one dict per sample keyed by target.
        """
        N = len(src)

        # raw model probabilities, regrouped from per-sample to per-target
        fmt_src = Formatter(self.src_modalities)
        src = [fmt_src(smp) for smp in src]
        prob = self.model.predict_proba(src, is_embedding)
        prob_by_tgt = {k: [prob[i][k] for i in range(N)] for k in self.tgt_modalities}

        # calibrate per target using the (memoized) background-fitted cores
        core = self._fit_core()
        calibrated_by_tgt = {k: core[k].predict(prob_by_tgt[k]) for k in self.tgt_modalities}

        # regroup back to one dict per sample
        return [
            {k: calibrated_by_tgt[k][i] for k in self.tgt_modalities} for i in range(N)
        ]

    def _fit_core(self) -> dict[str, CalibrationCore]:
        '''
        Fit one CalibrationCore per target on the full background dataset.

        The result is memoized on the instance rather than with
        functools.lru_cache: lru_cache on a bound method keys on `self` and
        keeps every instance alive for the lifetime of the process (B019).
        '''
        # return the cached cores if this instance has already fitted them
        try:
            return self._core_cache
        except AttributeError:
            pass

        # score the entire background with the wrapped model
        background_prob = self.model.predict_proba(self.background_src, self.background_is_embedding, _batch_size=1024)

        # regroup from per-sample dicts to per-target lists
        N = len(self.background_src)
        background_prob = {k: [background_prob[i][k] for i in range(N)] for k in self.tgt_modalities}
        background_true = {k: [self.background_tgt[i][k] for i in range(N)] for k in self.tgt_modalities}

        # one calibration regressor per target; no class-balance adjustment
        core: dict[str, CalibrationCore] = dict()
        for k in self.tgt_modalities:
            prob_true, prob_pred = calibration_curve(
                background_true[k], background_prob[k],
                ratio = 1.0,
            )
            core[k] = CalibrationCore(self.method).fit(prob_pred, prob_true)

        self._core_cache = core
        return core