Spaces:

vkola-lab
/

nmed2024

Running

nmed2024

File size: 16,115 Bytes

6fc43ab

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression
from functools import lru_cache
from functools import cached_property
from typing import Self, Any
from pickle import dump
from pickle import load
from abc import ABC, abstractmethod

from . import ADRDModel
from ..utils import Formatter
from ..utils import MissingMasker


def calibration_curve(
    y_true: list[int],
    y_pred: list[float],
    n_bins: int = 10,
    ratio: float = 1.0,
) -> tuple[list[float], list[float]]:
    """
    Compute true and predicted probabilities for a calibration curve. The method
    assumes the inputs come from a binary classifier, and discretize the [0, 1] 
    interval into bins.

    Note that this function is an alternative to
    sklearn.calibration.calibration_curve() which can only estimate the absolute
    proportion of positive cases in each bin.

    Parameters
    ----------
    y_true : list[int]
        True targets.
    y_pred : list[float]
        Probabilities of the positive class.
    n_bins : int, default=10
        Number of bins to discretize the [0, 1] interval. A bigger number
        requires more data. Bins with no samples (i.e. without corresponding
        values in y_prob) will not be returned, thus the returned arrays may
        have less than n_bins values.
    ratio : float, default=1.0
        Used to adjust the class balance.

    Returns
    -------
    prob_true : list[float]
        The proportion of positive samples in each bin.
    prob_pred : list[float]
        The mean predicted probability in each bin.
    """
    # generate "n_bin" intervals
    tmp = np.around(np.linspace(0, 1, n_bins + 1), decimals=6)
    intvs = [(tmp[i - 1], tmp[i]) for i in range(1, len(tmp))]
    
    # pair up (pred, true) and group them by intervals
    tmp = list(zip(y_pred, y_true))
    intv_pairs = {(l, r): [p for p in tmp if l <= p[0] < r] for l, r in intvs}

    # calculate balanced proportion of POSITIVE cases for each intervel
    # along with the balanced averaged predictions
    intv_prob_true: dict[tuple, float] = dict()
    intv_prob_pred: dict[tuple, float] = dict()
    for intv, pairs in intv_pairs.items():
        # number of cases that fall into the interval
        n_pairs = len(pairs)

        # it's likely that no predictions fall into the interval
        if n_pairs == 0: continue

        # count number of positives and negatives in the interval
        n_pos = sum([p[1] for p in pairs])
        n_neg = n_pairs - n_pos

        # calculate adjusted proportion of positives
        intv_prob_true[intv] = n_pos / (n_pos + n_neg * ratio)

        # calculate adjusted avg. predictions
        sum_pred_pos = sum([p[0] for p in pairs if p[1] == 1])
        sum_pred_neg = sum([p[0] for p in pairs if p[1] == 0])
        intv_prob_pred[intv] = (sum_pred_pos + sum_pred_neg * ratio)
        intv_prob_pred[intv] /= (n_pos + n_neg * ratio)

    prob_true = list(intv_prob_true.values())
    prob_pred = list(intv_prob_pred.values())
    return prob_true, prob_pred


class CalibrationCore(BaseEstimator):
    """
    A wrapper class of multiple regressors to predict the proportions of
    positive samples from the predicted probabilities. The method for
    calibration can be 'sigmoid' which corresponds to Platt's method (i.e. a 
    logistic regression model) or 'isotonic' which is a non-parametric approach.
    It is not advised to use isotonic calibration with too few calibration
    samples (<<1000) since it tends to overfit.

    TODO
    ----
    - 'sigmoid' method is not trivial to implement.
    """
    def __init__(self, 
        method: str = 'isotonic',
    ) -> None:
        """
        Initialization function of CalibrationCore class.

        Parameters
        ----------
        method : {'sigmoid', 'isotonic'}, default='isotonic'
            The method to use for calibration. can be 'sigmoid' which
            corresponds to Platt's method (i.e. a logistic regression model) or
            'isotonic' which is a non-parametric approach. It is not advised to
            use isotonic calibration with too few calibration samples (<<1000)
            since it tends to overfit.

        Raises
        ------
        ValueError
            Sigmoid approach has not been implemented.
        """        
        assert method in ('sigmoid', 'isotonic')
        if method == 'sigmoid':
            raise ValueError('Sigmoid approach has not been implemented.')
        self.method = method

    def fit(self, 
        prob_pred: list[float], 
        prob_true: list[float],
    ) -> Self:
        """
        Fit the underlying regressor using prob_pred, prob_true as training
        data.

        Parameters
        ----------
        prob_pred : list[float]
            Probabilities predicted directly by a model.
        prob_true : list[float]
            Target probabilities to calibrate to.

        Returns
        -------
        Self
            CalibrationCore object.
        """              
        # using Platt's method for calibration
        if self.method == 'sigmoid':
            self.model_ = LogisticRegression()
            self.model_.fit(prob_pred, prob_true)

        # using isotonic calibration
        elif self.method == 'isotonic':
            self.model_ = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
            self.model_.fit(prob_pred, prob_true)

        return self

    def predict(self,
        prob_pred: list[float],
    ) -> list[float]:
        """
        Calibrate the input probabilities using the fitted regressor.

        Parameters
        ----------
        prob_pred : list[float]
            Probabilities predicted directly by a model.

        Returns
        -------
        prob_cali : list[float]
            Calibrated probabilities.
        """        
        # as usual, the core needs to be fitted
        check_is_fitted(self)

        # note that logistic regression is classification model, we need to call
        # 'predict_proba' instead of 'predict' to get the calibrated results
        if self.method == 'sigmoid':
            prob_cali = self.model_.predict_proba(prob_pred)
        elif self.method == 'isotonic':
            prob_cali = self.model_.predict(prob_pred)

        return prob_cali
    

class CalibratedClassifier(ABC):
    """
    Abstract class of calibrated classifier.
    """
    def __init__(self, 
        model: ADRDModel,
        background_src: list[dict[str, Any]],
        background_tgt: list[dict[str, Any]],
        background_is_embedding: dict[str, bool] | None = None,
        method: str = 'isotonic',
    ) -> None:
        """
        Constructor of Calibrator class.

        Parameters
        ----------
        model : ADRDModel
            Fitted model to calibrate.
        background_src : list[dict[str, Any]]
            Features of the background dataset.
        background_tgt : list[dict[str, Any]]
            Labels of the background dataset.
        method : {'sigmoid', 'isotonic'}, default='isotonic'
            Method used by the underlying regressor. 
        """
        self.method = method
        self.model = model
        self.src_modalities = model.src_modalities
        self.tgt_modalities = model.tgt_modalities
        self.background_is_embedding = background_is_embedding

        # format background data
        fmt_src = Formatter(self.src_modalities)
        fmt_tgt = Formatter(self.tgt_modalities)
        self.background_src = [fmt_src(smp) for smp in background_src]
        self.background_tgt = [fmt_tgt(smp) for smp in background_tgt]
    
    @abstractmethod
    def predict_proba(self, 
        src: list[dict[str, Any]],
        is_embedding: dict[str, bool] | None = None,
    ) -> list[dict[str, float]]:
        """
        This method returns calibrated probabilities of classification.

        Parameters
        ----------
        src : list[dict[str, Any]]
            Features of the input samples.

        Returns
        -------
        list[dict[str, float]]
            Calibrated probabilities.
        """ 
        pass

    def predict(self,
        src: list[dict[str, Any]],
        is_embedding: dict[str, bool] | None = None,
    ) -> list[dict[str, int]]:
        """
        Make predictions based on the results of predict_proba().

        Parameters
        ----------
        x : list[dict[str, Any]]
            Input features.

        Returns
        -------
        list[dict[str, int]]
            Calibrated predictions.
        """
        proba = self.predict_proba(src, is_embedding)
        return [{k: int(smp[k] > 0.5) for k in self.tgt_modalities} for smp in proba]

    def save(self,
        filepath_state_dict: str,
    ) -> None:
        """
        Save the state dict and the underlying model to the given paths.

        Parameters
        ----------
        filepath_state_dict : str
            File path to save the state_dict which includes the background
            dataset and the regressor information.
        filepath_wrapped_model : str | None, default=None
            File path to save the wrapped model. If None, the model won't be
            saved. 
        """
        # save state dict
        state_dict = {
            'background_src': self.background_src,
            'background_tgt': self.background_tgt,
            'background_is_embedding': self.background_is_embedding,
            'method': self.method,
        }
        with open(filepath_state_dict, 'wb') as f:
            dump(state_dict, f)

    @classmethod
    def from_ckpt(cls,
        filepath_state_dict: str,
        filepath_wrapped_model: str,
    ) -> Self:
        """
        Alternative constructor which loads from checkpoint.

        Parameters
        ----------
        filepath_state_dict : str
            File path to load the state_dict which includes the background
            dataset and the regressor information.
        filepath_wrapped_model : str
            File path of the wrapped model.

        Returns
        -------
        Self
            CalibratedClassifier class object.
        """
        with open(filepath_state_dict, 'rb') as f:
            kwargs = load(f)
        kwargs['model'] = ADRDModel.from_ckpt(filepath_wrapped_model)
        return cls(**kwargs)


class DynamicCalibratedClassifier(CalibratedClassifier):
    """
    The dynamic approach generates background predictions based on the
    missingness pattern of each input. With an astronomical number of
    missingness patterns, calibrating each sample requires a comprehensive
    process that involves running the ADRDModel on the majority of the
    background data and training a corresponding regressor. This results in a
    computationally intensive calculation.
    """
    def predict_proba(self,
        src: list[dict[str, Any]],
        is_embedding: dict[str, bool] | None = None,
    ) -> list[dict[str, float]]:
        
        # initialize mask generator and format inputs
        msk_gen = MissingMasker(self.src_modalities)
        fmt_src = Formatter(self.src_modalities)
        src = [fmt_src(smp) for smp in src]

        # calculate calibrated probabilities
        calibrated_prob: list[dict[str, float]] = []
        for smp in src:
            # model output and missingness pattern
            prob = self.model.predict_proba([smp], is_embedding)[0]
            mask = tuple(msk_gen(smp).values())

            # get/fit core and calculate calibrated probabilities
            core = self._fit_core(mask)
            calibrated_prob.append({k: core[k].predict([prob[k]])[0] for k in self.tgt_modalities})

        return calibrated_prob
    
    # @lru_cache(maxsize = None)
    def _fit_core(self,
        missingness_pattern: tuple[bool],
    ) -> dict[str, CalibrationCore]:
        ''' ... ''' 
        # remove features from all background samples accordingly
        background_src, background_tgt = [], []
        for src, tgt in zip(self.background_src, self.background_tgt):
            src = {k: v for j, (k, v) in enumerate(src.items()) if missingness_pattern[j] == False}

            # make sure there is at least one feature available
            if len([v is not None for v in src.values()]) == 0: continue
            background_src.append(src)
            background_tgt.append(tgt)

        # run model on background samples and collection predictions
        background_prob = self.model.predict_proba(background_src, self.background_is_embedding, _batch_size=1024)

        # list[dict] -> dict[list]
        N = len(background_src)
        background_prob = {k: [background_prob[i][k] for i in range(N)] for k in self.tgt_modalities}
        background_true = {k: [background_tgt[i][k] for i in range(N)] for k in self.tgt_modalities}

        # now, fit cores
        core: dict[str, CalibrationCore] = dict()
        for k in self.tgt_modalities:
            prob_true, prob_pred = calibration_curve(
                background_true[k], background_prob[k],
                ratio = self.background_ratio[k],
            )
            core[k] = CalibrationCore(self.method).fit(prob_pred, prob_true)
        
        return core
    
    @cached_property
    def background_ratio(self) -> dict[str, float]:
        ''' The ratio of positives over negatives in the background dataset. '''
        return {k: self.background_n_pos[k] / self.background_n_neg[k] for k in self.tgt_modalities}

    @cached_property
    def background_n_pos(self) -> dict[str, int]:
        ''' Number of positives w.r.t each target in the background dataset. '''
        return {k: sum([d[k] for d in self.background_tgt]) for k in self.tgt_modalities}

    @cached_property
    def background_n_neg(self) -> dict[str, int]:
        ''' Number of negatives w.r.t each target in the background dataset. '''
        return {k: len(self.background_tgt) - self.background_n_pos[k] for k in self.tgt_modalities}


class StaticCalibratedClassifier(CalibratedClassifier):
    """
    The static approach generates background predictions without considering the
    missingness patterns.
    """
    def predict_proba(self,
        src: list[dict[str, Any]],
        is_embedding: dict[str, bool] | None = None,
    ) -> list[dict[str, float]]:

        # number of input samples
        N = len(src)

        # format inputs, and run ADRDModel, and convert to dict[list]
        fmt_src = Formatter(self.src_modalities)
        src = [fmt_src(smp) for smp in src]
        prob = self.model.predict_proba(src, is_embedding)
        prob = {k: [prob[i][k] for i in range(N)] for k in self.tgt_modalities}

        # calibrate probabilities
        core = self._fit_core()
        calibrated_prob = {k: core[k].predict(prob[k]) for k in self.tgt_modalities}

        # convert back to list[dict]
        calibrated_prob: list[dict[str, float]] = [
            {k: calibrated_prob[k][i] for k in self.tgt_modalities} for i in range(N)
        ]
        return calibrated_prob
    
    @lru_cache(maxsize = None)
    def _fit_core(self) -> dict[str, CalibrationCore]:
        ''' ... '''
        # run model on background samples and collection predictions
        background_prob = self.model.predict_proba(self.background_src, self.background_is_embedding, _batch_size=1024)

        # list[dict] -> dict[list]
        N = len(self.background_src)
        background_prob = {k: [background_prob[i][k] for i in range(N)] for k in self.tgt_modalities}
        background_true = {k: [self.background_tgt[i][k] for i in range(N)] for k in self.tgt_modalities}

        # now, fit cores
        core: dict[str, CalibrationCore] = dict()
        for k in self.tgt_modalities:
            prob_true, prob_pred = calibration_curve(
                background_true[k], background_prob[k],
                ratio = 1.0,
            )
            core[k] = CalibrationCore(self.method).fit(prob_pred, prob_true)
        
        return core