#!/usr/bin/env python3

import glob
import os
import re
import warnings
from typing import Any, List, Optional, Tuple, Union

import captum._utils.common as common
import torch
from captum.attr import LayerActivation
from torch import Tensor
from torch.nn import Module
from torch.utils.data import DataLoader, Dataset


class AV:
    r"""
    This class provides functionality to store and load activation vectors
    generated for pre-defined neural network layers.
    It also provides functionality to check whether activation vectors already
    exist in the manifold, along with other auxiliary functions.

    This class also defines a torch `Dataset`, representing Activation Vectors,
    which enables lazy access to the activation vectors and layers stored in the
    manifold.
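
    Example (a minimal usage sketch; the path, model id, identifier, and layer
    name below are hypothetical)::

        >>> acts = torch.rand(32, 128)  # one batch of activations for a layer
        >>> AV.save("/tmp/acts", "my_model", "train-0", "fc1", acts, "0")
        >>> av_dataset = AV.load("/tmp/acts", "my_model", "train-0", "fc1")
        >>> first_batch = av_dataset[0]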

    """

    r"""
        The name of the subfolder in the manifold where the activation vectors
        are stored.
    """

    class AVDataset(Dataset):
        r"""
        This dataset enables access to activation vectors for a given `model` stored
        under a pre-defined path.
        The iterator of this dataset returns a batch of data tensors.
        Additionally, subsets of the model activations can be loaded based on layer,
        identifier, or num_id (representing the batch number in the source dataset).
        """

        def __init__(
            self,
            path: str,
            model_id: str,
            identifier: Optional[str] = None,
            layer: Optional[str] = None,
            num_id: Optional[str] = None,
        ):
            r"""
            Loads into memory the list of all activation file paths associated
            with the input `model_id`.

            Args:
                path (str): The path where the activation vectors
                        for the `layer` are stored.
                model_id (str): The name/version of the model for which layer
                        activations are being computed and stored.
                identifier (str or None): An optional identifier for the layer
                        activations. Can be used to distinguish between activations for
                        different training batches.
                layer (str or None): The layer for which the activation vectors
                        are computed.
                num_id (str or None): An optional string representing the batch
                    number for which the activation vectors are computed.
            """

            self.av_filesearch = AV._construct_file_search(
                path, model_id, identifier, layer, num_id
            )

            files = glob.glob(self.av_filesearch)

            self.files = AV.sort_files(files)

        def __getitem__(self, idx: int) -> Union[Tensor, Tuple[Tensor, ...]]:
            if idx >= len(self.files):
                raise IndexError("Activation file index is out of bounds!")
            fl = self.files[idx]
            av = torch.load(fl)
            return av

        def __len__(self):
            return len(self.files)

    AV_DIR_NAME: str = "av"

    def __init__(self) -> None:
        pass

    @staticmethod
    def _assemble_model_dir(path: str, model_id: str) -> str:
        r"""
        Returns a directory path for the given source path `path` and `model_id`.
        This path is suffixed with the '/' delimiter.
        """
        return "/".join([path, AV.AV_DIR_NAME, model_id, ""])

    @staticmethod
    def _assemble_file_path(source_dir: str, identifier: str, layer: str) -> str:
        r"""
        Returns a full filepath given a source directory, layer, and required
        identifier. The source dir is not required to end with a "/" delimiter.
        """
        if not source_dir.endswith("/"):
            source_dir += "/"

        filepath = os.path.join(source_dir, identifier)

        filepath = os.path.join(filepath, layer)

        return filepath

    @staticmethod
    def _construct_file_search(
        source_dir: str,
        model_id: str,
        identifier: Optional[str] = None,
        layer: Optional[str] = None,
        num_id: Optional[str] = None,
    ) -> str:
        r"""
        Returns a search string that can be used by glob to search `source_dir/model_id`
        for the desired layer/identifier pair. Leaving `layer` as None will search ids
        over all layers, and leaving `identifier` as None will search layers over all
        ids. Leaving both as None will return a path to glob for every activation.
        Assumes identifier is always specified when saving activations, so that
        activations live at source_dir/model_id/identifier/layer
        (and never source_dir/model_id/layer)
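
        For example, with hypothetical arguments, a call such as
        AV._construct_file_search("/tmp/acts", "my_model", "train-0", "fc1")
        returns the glob pattern "/tmp/acts/av/my_model/train-0/fc1/*.pt"
        ("av" being AV.AV_DIR_NAME); omitting the optional arguments widens the
        corresponding path components to "*".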
        """

        av_filesearch = AV._assemble_model_dir(source_dir, model_id)

        av_filesearch = os.path.join(
            av_filesearch, "*" if identifier is None else identifier
        )

        av_filesearch = os.path.join(av_filesearch, "*" if layer is None else layer)

        av_filesearch = os.path.join(
            av_filesearch, "*.pt" if num_id is None else "%s.pt" % num_id
        )

        return av_filesearch

    @staticmethod
    def exists(
        path: str,
        model_id: str,
        identifier: Optional[str] = None,
        layer: Optional[str] = None,
        num_id: Optional[str] = None,
    ) -> bool:
        r"""
        Verifies whether the model + layer activations exist
        under the path.

        Args:
            path (str): The path where the activation vectors
                    for the `model_id` are stored.
            model_id (str): The name/version of the model for which layer activations
                    are being computed and stored.
            identifier (str or None): An optional identifier for the layer activations.
                    Can be used to distinguish between activations for different
                    training batches. For example, the id could be a suffix composed of
                    a train/test label and numerical value, such as "-train-xxxxx".
                    The numerical id is often a monotonic sequence taken from datetime.
            layer (str or None): The layer for which the activation vectors are
                    computed.
            num_id (str or None): An optional string representing the batch number for
                    which the activation vectors are computed.

        Returns:
            exists (bool): Indicates whether the activation vectors for the `layer`
                    and `identifier` (if provided) and `num_id` (if provided) were
                    stored in the manifold. If no `identifier` is provided, returns
                    `True` if any layer activation exists, with or without an
                    identifier, and vice-versa.
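
        Example (hypothetical path and names)::

            >>> # True if any file matches /tmp/acts/av/my_model/train-0/fc1/*.pt
            >>> AV.exists("/tmp/acts", "my_model", identifier="train-0", layer="fc1")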
        """
        av_dir = AV._assemble_model_dir(path, model_id)
        av_filesearch = AV._construct_file_search(
            path, model_id, identifier, layer, num_id
        )
        return os.path.exists(av_dir) and len(glob.glob(av_filesearch)) > 0

    @staticmethod
    def save(
        path: str,
        model_id: str,
        identifier: str,
        layers: Union[str, List[str]],
        act_tensors: Union[Tensor, List[Tensor]],
        num_id: str,
    ) -> None:
        r"""
        Saves the activation vectors `act_tensors` for the given
        `layers` under the manifold `path`.

        Args:
            path (str): The path where the activation vectors
                    for the `layer` are stored.
            model_id (str): The name/version of the model for which layer activations
                    are being computed and stored.
            identifier (str): An identifier for the layer activations. Can be used to
                    distinguish between activations for different training batches.
                    For example, the identifier could be a suffix composed of a
                    train/test label and a numerical value, such as "-src-abc",
                    where (abc) could be a unique identifying number.
                    The identifier is assumed to be the same for all layers if a
                    list of `layers` is provided.
            layers (str or List of str): The layer(s) for which the activation vectors
                    are computed.
            act_tensors (Tensor or List of Tensor): A batch of activation vectors.
                    This must match the length of `layers`.
            num_id (str): A string representing the batch number for which the
                    activation vectors are computed (in
                    AV.generate_dataset_activations, this is created automatically
                    from the batch index).
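
        Example (illustrative path, layer names, and tensors)::

            >>> acts_fc1 = torch.rand(32, 128)
            >>> acts_fc2 = torch.rand(32, 10)
            >>> AV.save(
            ...     "/tmp/acts", "my_model", "train-0",
            ...     ["fc1", "fc2"], [acts_fc1, acts_fc2], "0"
            ... )
            >>> # Tensors are stored at /tmp/acts/av/my_model/train-0/<layer>/0.pt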
        """
        if isinstance(layers, str):
            layers = [layers]
        if isinstance(act_tensors, Tensor):
            act_tensors = [act_tensors]

        if len(layers) != len(act_tensors):
            raise ValueError("The lengths of `layers` and `act_tensors` must match!")

        av_dir = AV._assemble_model_dir(path, model_id)

        for i, layer in enumerate(layers):
            av_save_fl_path = os.path.join(
                AV._assemble_file_path(av_dir, identifier, layer), "%s.pt" % num_id
            )

            layer_dir = os.path.dirname(av_save_fl_path)
            os.makedirs(layer_dir, exist_ok=True)
            torch.save(act_tensors[i], av_save_fl_path)

    @staticmethod
    def load(
        path: str,
        model_id: str,
        identifier: Optional[str] = None,
        layer: Optional[str] = None,
        num_id: Optional[str] = None,
    ) -> AVDataset:
        r"""
        Lazily loads the activation vectors for the given `model_id` and
        `layer` saved under the `path`.

        Args:
            path (str): The path where the activation vectors
                    for the `layer` are stored.
            model_id (str): The name/version of the model for which layer activations
                    are being computed and stored.
            identifier (str or None): An optional identifier for the layer
                    activations. Can be used to distinguish between activations for
                    different training batches.
            layer (str or None): The layer for which the activation vectors
                are computed.
            num_id (str or None): An optional string representing the batch number for
                    which the activation vectors are computed.

        Returns:
            dataset (AV.AVDataset): An AV.AVDataset that allows iteration over the
                    activation vectors for the given layer, identifier (if provided),
                    and num_id (if provided). Returning an AV.AVDataset as opposed to
                    a DataLoader constructed from it offers more flexibility.
                    Raises RuntimeError if the activation vectors are not found.
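
        Example (hypothetical path and names, assuming the activations were
        previously saved with AV.save or AV.generate_dataset_activations)::

            >>> av_dataset = AV.load("/tmp/acts", "my_model", "train-0", "fc1")
            >>> num_batches = len(av_dataset)
            >>> first_batch_acts = av_dataset[0]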
        """

        av_save_dir = AV._assemble_model_dir(path, model_id)

        if os.path.exists(av_save_dir):
            avdataset = AV.AVDataset(path, model_id, identifier, layer, num_id)
            return avdataset
        else:
            raise RuntimeError(
                f"Activation vectors for model {model_id} was not found at path {path}"
            )

    @staticmethod
    def _manage_loading_layers(
        path: str,
        model_id: str,
        layers: Union[str, List[str]],
        load_from_disk: bool = True,
        identifier: Optional[str] = None,
        num_id: Optional[str] = None,
    ) -> List[str]:
        r"""
        Returns the layers for which activations are not yet saved. If
        `load_from_disk` is False, deletes any previously saved activations for the
        given layers so that they can be regenerated.

        Args:
            path (str): The path where the activation vectors
                    for the `layer` are stored.
            model_id (str): The name/version of the model for which layer activations
                    are being computed and stored.
            layers (str or List of str): The layer(s) for which the activation vectors
                    are computed.
            identifier (str or None): An optional identifier for the layer
                    activations. Can be used to distinguish between activations for
                    different training batches.
            load_from_disk (bool): If False, removes any previously saved activations
                    for the given layers so that they are regenerated.
                    Default: True
            num_id (str or None): An optional string representing the batch number for
                    which the activation vectors are computed.

        Returns:
            List of layer names for which activations should be generated
        """

        layers = [layers] if isinstance(layers, str) else layers
        unsaved_layers = []

        if load_from_disk:
            for layer in layers:
                if not AV.exists(path, model_id, identifier, layer, num_id):
                    unsaved_layers.append(layer)
        else:
            unsaved_layers = layers
            warnings.warn(
                "Overwriting activations: load_from_disk is set to False. Removing all "
                f"activations matching specified parameters {{path: {path}, "
                f"model_id: {model_id}, layers: {layers}, identifier: {identifier}}} "
                "before generating new activations."
            )
            for layer in layers:
                files = glob.glob(
                    AV._construct_file_search(path, model_id, identifier, layer)
                )
                for filename in files:
                    os.remove(filename)

        return unsaved_layers

    @staticmethod
    def _compute_and_save_activations(
        path: str,
        model: Module,
        model_id: str,
        layers: Union[str, List[str]],
        inputs: Union[Tensor, Tuple[Tensor, ...]],
        identifier: str,
        num_id: str,
        additional_forward_args: Any = None,
        load_from_disk: bool = True,
    ) -> None:
        r"""
        Computes layer activations for the given inputs and specified `layers`.

        Args:
            path (str): The path where the activation vectors
                    for the `layer` are stored.
            model (torch.nn.Module): An instance of pytorch model. This model should
                    define all of its layers as attributes of the model.
            model_id (str): The name/version of the model for which layer activations
                    are being computed and stored.
            layers (str or List of str): The layer(s) for which the activation vectors
                    are computed.
            inputs (tensor or tuple of tensors): Batch of examples for
                    which layer activations are computed. They are passed to the
                    input `model`. The first dimension in the `inputs` tensor or tuple
                    of tensors corresponds to the batch size.
            identifier (str): An identifier for the layer
                    activations. Can be used to distinguish between activations for
                    different training batches.
            num_id (str): A required string representing the batch number for which
                    the activation vectors are computed.
            additional_forward_args (optional):  Additional arguments that will be
                    passed to `model` after inputs.
                    Default: None
            load_from_disk (bool): Forces function to regenerate activations if False.
                    Default: True
        """
        unsaved_layers = AV._manage_loading_layers(
            path,
            model_id,
            layers,
            load_from_disk,
            identifier,
            num_id,
        )
        layer_modules = [
            common._get_module_from_name(model, layer) for layer in unsaved_layers
        ]
        if len(unsaved_layers) > 0:
            layer_act = LayerActivation(model, layer_modules)
            # Call the undecorated `attribute` method via `__wrapped__`, bypassing
            # the decorator that wraps Captum's public attribute methods.
            new_activations = layer_act.attribute.__wrapped__(  # type: ignore
                layer_act, inputs, additional_forward_args
            )
            AV.save(path, model_id, identifier, unsaved_layers, new_activations, num_id)

    @staticmethod
    def _unpack_data(data: Union[Any, Tuple[Any, Any]]) -> Any:
        r"""
        Helper to extract input from labels when getting items from a Dataset. Assumes
        that data is either a single value, or a tuple containing two elements.
        The input could itself be a Tuple containing multiple values. If your
        dataset returns a Tuple with more than 2 elements, please reformat it such that
        all inputs are formatted into a tuple stored at the first position.
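
        For example, a dataset item of the form (inputs, labels) yields `inputs`,
        while a bare tensor is returned unchanged.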
        """
        if isinstance(data, (tuple, list)):
            data = data[0]
        return data

    r"""TODO:
    1. Can propagate saving labels along with activations.
    2. Use of additional_forward_args when sourcing from dataset?
    """

    @staticmethod
    def generate_dataset_activations(
        path: str,
        model: Module,
        model_id: str,
        layers: Union[str, List[str]],
        dataloader: DataLoader,
        identifier: str = "default",
        load_from_disk: bool = True,
        return_activations: bool = False,
    ) -> Optional[Union[AVDataset, List[AVDataset]]]:
        r"""
        Computes layer activations for a source dataset and specified `layers`. Assumes
        that the dataset returns a single value, or a tuple containing two elements
        (see AV._unpack_data).

        Args:
            path (str): The path where the activation vectors
                    for the `layer` are stored.
            model (torch.nn.Module): An instance of a PyTorch model. This model should
                    define all of its layers as attributes of the model.
            model_id (str): The name/version of the model for which layer activations
                    are being computed and stored.
            layers (str or List of str): The layer(s) for which the activation vectors
                    are computed.
            dataloader (torch.utils.data.DataLoader): DataLoader that yields batches
                    of the dataset for which layer activations are computed. The
                    batches are passed to the input `model`.
            identifier (str): An identifier for the layer
                    activations. Can be used to distinguish between activations for
                    different training batches.
                    Default: "default"
            load_from_disk (bool): Forces function to regenerate activations if False.
                    Default: True
            return_activations (bool, optional): Whether to return the activations.
                    Default: False
        Returns:
            If `return_activations == True`, returns a single `AVDataset` if `layers`
            is a str, otherwise a list of `AVDataset`s of the same length as `layers`,
            where each element corresponds to a layer. In either case, each `AVDataset`
            represents the activations of a single layer over the entire `dataloader`.
            If `return_activations == False`, does not return anything.
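
        Example (a minimal sketch; `model`, `train_set`, the path, and the layer
        names are hypothetical)::

            >>> train_loader = DataLoader(train_set, batch_size=64)
            >>> AV.generate_dataset_activations(
            ...     "/tmp/acts", model, "my_model", ["fc1", "fc2"], train_loader
            ... )
            >>> fc1_acts = AV.load("/tmp/acts", "my_model", "default", "fc1")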

        """

        unsaved_layers = AV._manage_loading_layers(
            path,
            model_id,
            layers,
            load_from_disk,
            identifier,
        )
        if len(unsaved_layers) > 0:
            for i, data in enumerate(dataloader):
                AV._compute_and_save_activations(
                    path,
                    model,
                    model_id,
                    layers,
                    AV._unpack_data(data),
                    identifier,
                    str(i),
                )

        if not return_activations:
            return None
        if isinstance(layers, str):
            return AV.load(path, model_id, identifier, layers)
        else:
            return [AV.load(path, model_id, identifier, layer) for layer in layers]

    @staticmethod
    def sort_files(files: List[str]) -> List[str]:
        r"""
        Utility for sorting files based on natural sorting instead of the default
        lexicographical sort.
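        For example, ["av/10.pt", "av/2.pt"] is sorted as ["av/2.pt", "av/10.pt"].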
        """

        def split_alphanum(s):
            r"""
            Splits string into a list of strings and numbers
                "z23a" -> ["z", 23, "a"]
            """

            return [int(x) if x.isdigit() else x for x in re.split("([0-9]+)", s)]

        return sorted(files, key=split_alphanum)