import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression
from functools import lru_cache
from functools import cached_property
from typing import Self, Any
from pickle import dump
from pickle import load
from abc import ABC, abstractmethod

from . import ADRDModel
from ..utils import Formatter
from ..utils import MissingMasker


def calibration_curve(
    y_true: list[int],
    y_pred: list[float],
    n_bins: int = 10,
    ratio: float = 1.0,
) -> tuple[list[float], list[float]]:
    """
    Compute true and predicted probabilities for a calibration curve. The
    method assumes the inputs come from a binary classifier, and discretizes
    the [0, 1] interval into bins.

    Note that this function is an alternative to
    sklearn.calibration.calibration_curve() which can only estimate the absolute
    proportion of positive cases in each bin.

    Parameters
    ----------
    y_true : list[int]
        True targets.
    y_pred : list[float]
        Probabilities of the positive class.
    n_bins : int, default=10
        Number of bins to discretize the [0, 1] interval. A bigger number
        requires more data. Bins with no samples (i.e. without corresponding
        values in y_pred) will not be returned, thus the returned arrays may
        have fewer than n_bins values.
    ratio : float, default=1.0
        Weight applied to negative samples to adjust for class imbalance;
        each negative counts as `ratio` of a sample when computing the
        proportions and the mean predictions.

    Returns
    -------
    prob_true : list[float]
        The proportion of positive samples in each bin.
    prob_pred : list[float]
        The mean predicted probability in each bin.
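
    Examples
    --------
    A toy sketch with made-up values (two bins, default ratio):

    >>> y_true = [0, 0, 1, 1]
    >>> y_pred = [0.1, 0.4, 0.35, 0.8]
    >>> prob_true, prob_pred = calibration_curve(y_true, y_pred, n_bins=2)
    >>> [round(p, 2) for p in prob_true]
    [0.33, 1.0]
    >>> [round(p, 2) for p in prob_pred]
    [0.28, 0.8]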
    """
    # generate "n_bin" intervals
    tmp = np.around(np.linspace(0, 1, n_bins + 1), decimals=6)
    intvs = [(tmp[i - 1], tmp[i]) for i in range(1, len(tmp))]
    
    # pair up (pred, true) and group them by intervals; the last interval is
    # right-inclusive so that predictions equal to 1.0 are not dropped
    tmp = list(zip(y_pred, y_true))
    intv_pairs = {
        (l, r): [p for p in tmp if l <= p[0] < r or (r == 1 and p[0] == 1)]
        for l, r in intvs
    }

    # calculate the balanced proportion of POSITIVE cases for each interval
    # along with the balanced average predictions
    intv_prob_true: dict[tuple, float] = dict()
    intv_prob_pred: dict[tuple, float] = dict()
    for intv, pairs in intv_pairs.items():
        # number of cases that fall into the interval
        n_pairs = len(pairs)

        # it's likely that no predictions fall into the interval
        if n_pairs == 0: continue

        # count number of positives and negatives in the interval
        n_pos = sum([p[1] for p in pairs])
        n_neg = n_pairs - n_pos

        # calculate adjusted proportion of positives
        intv_prob_true[intv] = n_pos / (n_pos + n_neg * ratio)

        # calculate adjusted avg. predictions
        sum_pred_pos = sum([p[0] for p in pairs if p[1] == 1])
        sum_pred_neg = sum([p[0] for p in pairs if p[1] == 0])
        intv_prob_pred[intv] = (sum_pred_pos + sum_pred_neg * ratio)
        intv_prob_pred[intv] /= (n_pos + n_neg * ratio)

    prob_true = list(intv_prob_true.values())
    prob_pred = list(intv_prob_pred.values())
    return prob_true, prob_pred


class CalibrationCore(BaseEstimator):
    """
    A wrapper around a regressor that predicts the proportion of positive
    samples from the predicted probabilities. The calibration method can be
    'sigmoid', which corresponds to Platt's method (i.e. a logistic regression
    model), or 'isotonic', which is a non-parametric approach. It is not
    advised to use isotonic calibration with too few calibration samples
    (<<1000) since it tends to overfit.

    TODO
    ----
    - 'sigmoid' method is not trivial to implement.
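
    Examples
    --------
    A minimal sketch with toy values; the fitted isotonic regressor
    interpolates linearly between the calibration points:

    >>> core = CalibrationCore(method='isotonic')
    >>> core = core.fit([0.1, 0.5, 0.9], [0.2, 0.4, 0.8])
    >>> round(float(core.predict([0.7])[0]), 2)
    0.6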
    """
    def __init__(self, 
        method: str = 'isotonic',
    ) -> None:
        """
        Initialization function of CalibrationCore class.

        Parameters
        ----------
        method : {'sigmoid', 'isotonic'}, default='isotonic'
            The method to use for calibration. Can be 'sigmoid', which
            corresponds to Platt's method (i.e. a logistic regression model),
            or 'isotonic', which is a non-parametric approach. It is not
            advised to use isotonic calibration with too few calibration
            samples (<<1000) since it tends to overfit.

        Raises
        ------
        ValueError
            Sigmoid approach has not been implemented.
        """        
        assert method in ('sigmoid', 'isotonic')
        if method == 'sigmoid':
            raise ValueError('Sigmoid approach has not been implemented.')
        self.method = method

    def fit(self, 
        prob_pred: list[float], 
        prob_true: list[float],
    ) -> Self:
        """
        Fit the underlying regressor using prob_pred, prob_true as training
        data.

        Parameters
        ----------
        prob_pred : list[float]
            Probabilities predicted directly by a model.
        prob_true : list[float]
            Target probabilities to calibrate to.

        Returns
        -------
        Self
            CalibrationCore object.
        """              
        # using Platt's method for calibration
        if self.method == 'sigmoid':
            self.model_ = LogisticRegression()
            self.model_.fit(prob_pred, prob_true)

        # using isotonic calibration
        elif self.method == 'isotonic':
            self.model_ = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
            self.model_.fit(prob_pred, prob_true)

        return self

    def predict(self,
        prob_pred: list[float],
    ) -> list[float]:
        """
        Calibrate the input probabilities using the fitted regressor.

        Parameters
        ----------
        prob_pred : list[float]
            Probabilities predicted directly by a model.

        Returns
        -------
        prob_cali : list[float]
            Calibrated probabilities.
        """        
        # as usual, the core needs to be fitted
        check_is_fitted(self)

        # note that logistic regression is a classification model; we need to
        # call 'predict_proba' instead of 'predict' to get calibrated results
        if self.method == 'sigmoid':
            prob_cali = self.model_.predict_proba(prob_pred)
        elif self.method == 'isotonic':
            prob_cali = self.model_.predict(prob_pred)

        return prob_cali
    

class CalibratedClassifier(ABC):
    """
    Abstract base class for calibrated classifiers.
    """
    def __init__(self, 
        model: ADRDModel,
        background_src: list[dict[str, Any]],
        background_tgt: list[dict[str, Any]],
        background_is_embedding: dict[str, bool] | None = None,
        method: str = 'isotonic',
    ) -> None:
        """
        Constructor of the CalibratedClassifier class.

        Parameters
        ----------
        model : ADRDModel
            Fitted model to calibrate.
        background_src : list[dict[str, Any]]
            Features of the background dataset.
        background_tgt : list[dict[str, Any]]
            Labels of the background dataset.
        background_is_embedding : dict[str, bool] | None, default=None
            Flags indicating which background features are provided as
            embeddings rather than raw inputs.
        method : {'sigmoid', 'isotonic'}, default='isotonic'
            Method used by the underlying regressor.
        """
        self.method = method
        self.model = model
        self.src_modalities = model.src_modalities
        self.tgt_modalities = model.tgt_modalities
        self.background_is_embedding = background_is_embedding

        # format background data
        fmt_src = Formatter(self.src_modalities)
        fmt_tgt = Formatter(self.tgt_modalities)
        self.background_src = [fmt_src(smp) for smp in background_src]
        self.background_tgt = [fmt_tgt(smp) for smp in background_tgt]
    
    @abstractmethod
    def predict_proba(self, 
        src: list[dict[str, Any]],
        is_embedding: dict[str, bool] | None = None,
    ) -> list[dict[str, float]]:
        """
        This method returns calibrated probabilities of classification.

        Parameters
        ----------
        src : list[dict[str, Any]]
            Features of the input samples.
        is_embedding : dict[str, bool] | None, default=None
            Flags indicating which input features are provided as embeddings.

        Returns
        -------
        list[dict[str, float]]
            Calibrated probabilities.
        """ 
        pass

    def predict(self,
        src: list[dict[str, Any]],
        is_embedding: dict[str, bool] | None = None,
    ) -> list[dict[str, int]]:
        """
        Make predictions based on the results of predict_proba().

        Parameters
        ----------
        src : list[dict[str, Any]]
            Features of the input samples.
        is_embedding : dict[str, bool] | None, default=None
            Flags indicating which input features are provided as embeddings.

        Returns
        -------
        list[dict[str, int]]
            Calibrated predictions.
        """
        proba = self.predict_proba(src, is_embedding)
        return [{k: int(smp[k] > 0.5) for k in self.tgt_modalities} for smp in proba]

    def save(self,
        filepath_state_dict: str,
    ) -> None:
        """
        Save the state dict to the given path.

        Parameters
        ----------
        filepath_state_dict : str
            File path to save the state_dict which includes the background
            dataset and the regressor information.
        """
        # save state dict
        state_dict = {
            'background_src': self.background_src,
            'background_tgt': self.background_tgt,
            'background_is_embedding': self.background_is_embedding,
            'method': self.method,
        }
        with open(filepath_state_dict, 'wb') as f:
            dump(state_dict, f)

    @classmethod
    def from_ckpt(cls,
        filepath_state_dict: str,
        filepath_wrapped_model: str,
    ) -> Self:
        """
        Alternative constructor which loads from checkpoint.

        Parameters
        ----------
        filepath_state_dict : str
            File path to load the state_dict which includes the background
            dataset and the regressor information.
        filepath_wrapped_model : str
            File path of the wrapped model.

        Returns
        -------
        Self
            CalibratedClassifier class object.
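
        Examples
        --------
        A hypothetical sketch; both file paths are placeholders:

        >>> clf = StaticCalibratedClassifier.from_ckpt(  # doctest: +SKIP
        ...     'calibrator_state.pkl', 'wrapped_model.pt')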
        """
        with open(filepath_state_dict, 'rb') as f:
            kwargs = load(f)
        kwargs['model'] = ADRDModel.from_ckpt(filepath_wrapped_model)
        return cls(**kwargs)


class DynamicCalibratedClassifier(CalibratedClassifier):
    """
    The dynamic approach generates background predictions conditioned on the
    missingness pattern of each input. Because the number of possible
    missingness patterns is enormous, calibrating a sample may require running
    the ADRDModel on most of the background data and fitting a dedicated
    regressor per pattern, which is computationally intensive.
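
    Examples
    --------
    A hypothetical sketch; ``model`` is a fitted ADRDModel, and the background
    lists and ``src`` are placeholders. Each input sample triggers a fit for
    its own missingness pattern:

    >>> clf = DynamicCalibratedClassifier(  # doctest: +SKIP
    ...     model, background_src, background_tgt)
    >>> clf.predict_proba(src)  # doctest: +SKIP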
    """
    def predict_proba(self,
        src: list[dict[str, Any]],
        is_embedding: dict[str, bool] | None = None,
    ) -> list[dict[str, float]]:
        
        # initialize mask generator and format inputs
        msk_gen = MissingMasker(self.src_modalities)
        fmt_src = Formatter(self.src_modalities)
        src = [fmt_src(smp) for smp in src]

        # calculate calibrated probabilities
        calibrated_prob: list[dict[str, float]] = []
        for smp in src:
            # model output and missingness pattern
            prob = self.model.predict_proba([smp], is_embedding)[0]
            mask = tuple(msk_gen(smp).values())

            # get/fit core and calculate calibrated probabilities
            core = self._fit_core(mask)
            calibrated_prob.append({k: core[k].predict([prob[k]])[0] for k in self.tgt_modalities})

        return calibrated_prob
    
    # @lru_cache(maxsize = None)
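    # NOTE: caching is left disabled above; enabling lru_cache would retain one
    # set of fitted cores for every distinct missingness pattern encountered.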
    def _fit_core(self,
        missingness_pattern: tuple[bool, ...],
    ) -> dict[str, CalibrationCore]:
        ''' Fit one calibration core per target for the given missingness pattern. '''
        # remove features from all background samples accordingly
        background_src, background_tgt = [], []
        for src, tgt in zip(self.background_src, self.background_tgt):
            src = {k: v for j, (k, v) in enumerate(src.items()) if not missingness_pattern[j]}

            # make sure there is at least one feature available
            if all(v is None for v in src.values()): continue
            background_src.append(src)
            background_tgt.append(tgt)

        # run model on background samples and collect predictions
        background_prob = self.model.predict_proba(background_src, self.background_is_embedding, _batch_size=1024)

        # list[dict] -> dict[list]
        N = len(background_src)
        background_prob = {k: [background_prob[i][k] for i in range(N)] for k in self.tgt_modalities}
        background_true = {k: [background_tgt[i][k] for i in range(N)] for k in self.tgt_modalities}

        # now, fit cores
        core: dict[str, CalibrationCore] = dict()
        for k in self.tgt_modalities:
            prob_true, prob_pred = calibration_curve(
                background_true[k], background_prob[k],
                ratio = self.background_ratio[k],
            )
            core[k] = CalibrationCore(self.method).fit(prob_pred, prob_true)
        
        return core
    
    @cached_property
    def background_ratio(self) -> dict[str, float]:
        ''' The ratio of positives over negatives in the background dataset. '''
        return {k: self.background_n_pos[k] / self.background_n_neg[k] for k in self.tgt_modalities}

    @cached_property
    def background_n_pos(self) -> dict[str, int]:
        ''' Number of positives w.r.t each target in the background dataset. '''
        return {k: sum([d[k] for d in self.background_tgt]) for k in self.tgt_modalities}

    @cached_property
    def background_n_neg(self) -> dict[str, int]:
        ''' Number of negatives w.r.t each target in the background dataset. '''
        return {k: len(self.background_tgt) - self.background_n_pos[k] for k in self.tgt_modalities}


class StaticCalibratedClassifier(CalibratedClassifier):
    """
    The static approach generates background predictions without considering the
    missingness patterns.
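
    Examples
    --------
    A hypothetical sketch; ``model`` is a fitted ADRDModel, and the background
    lists and ``src`` are placeholders. predict() thresholds the calibrated
    probabilities at 0.5:

    >>> clf = StaticCalibratedClassifier(  # doctest: +SKIP
    ...     model, background_src, background_tgt)
    >>> clf.predict(src)  # doctest: +SKIP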
    """
    def predict_proba(self,
        src: list[dict[str, Any]],
        is_embedding: dict[str, bool] | None = None,
    ) -> list[dict[str, float]]:

        # number of input samples
        N = len(src)

        # format inputs, run the ADRDModel, and convert to dict of lists
        fmt_src = Formatter(self.src_modalities)
        src = [fmt_src(smp) for smp in src]
        prob = self.model.predict_proba(src, is_embedding)
        prob = {k: [prob[i][k] for i in range(N)] for k in self.tgt_modalities}

        # calibrate probabilities
        core = self._fit_core()
        calibrated_prob = {k: core[k].predict(prob[k]) for k in self.tgt_modalities}

        # convert back to list[dict]
        calibrated_prob: list[dict[str, float]] = [
            {k: calibrated_prob[k][i] for k in self.tgt_modalities} for i in range(N)
        ]
        return calibrated_prob
    
    @lru_cache(maxsize = None)
    def _fit_core(self) -> dict[str, CalibrationCore]:
        ''' Fit one calibration core per target using the full background dataset. '''
        # run model on background samples and collect predictions
        background_prob = self.model.predict_proba(self.background_src, self.background_is_embedding, _batch_size=1024)

        # list[dict] -> dict[list]
        N = len(self.background_src)
        background_prob = {k: [background_prob[i][k] for i in range(N)] for k in self.tgt_modalities}
        background_true = {k: [self.background_tgt[i][k] for i in range(N)] for k in self.tgt_modalities}

        # now, fit cores
        core: dict[str, CalibrationCore] = dict()
        for k in self.tgt_modalities:
            prob_true, prob_pred = calibration_curve(
                background_true[k], background_prob[k],
                ratio = 1.0,
            )
            core[k] = CalibrationCore(self.method).fit(prob_pred, prob_true)
        
        return core