#!/usr/bin/env python3
import glob
import os
import re
import warnings
from typing import Any, List, Optional, Tuple, Union
import captum._utils.common as common
import torch
from captum.attr import LayerActivation
from torch import Tensor
from torch.nn import Module
from torch.utils.data import DataLoader, Dataset
class AV:
r"""
This class provides functionality to store and load activation vectors
generated for pre-defined neural network layers.
It also provides functionality to check if activation vectors already
exist in the manifold and other auxiliary functions.
This class also defines a torch `Dataset`, representing Activation Vectors,
which enables lazy access to activation vectors and layers stored in the manifold.
"""
r"""
The name of the subfolder in the manifold where the activation vectors
are stored.
"""
class AVDataset(Dataset):
r"""
This dataset enables access to activation vectors for a given `model` stored
under a pre-defined path.
The iterator of this dataset returns a batch of data tensors.
Additionally, subsets of the model activations can be loaded based on layer
or identifier or num_id (representing batch number in source dataset).
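A minimal usage sketch (the path, model id, identifier, and layer name
below are hypothetical placeholders)::

    avds = AV.AVDataset("/tmp/av_root", "resnet18_v1", "train", "layer4")
    first_saved_batch = avds[0]  # Tensor for the first stored batch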
"""
def __init__(
self,
path: str,
model_id: str,
identifier: Optional[str] = None,
layer: Optional[str] = None,
num_id: Optional[str] = None,
):
r"""
Loads into memory the list of all activation file paths associated
with the input `model_id`.
Args:
path (str): The path where the activation vectors
for the `layer` are stored.
model_id (str): The name/version of the model for which layer
activations are being computed and stored.
identifier (str or None): An optional identifier for the layer
activations. Can be used to distinguish between activations for
different training batches.
layer (str or None): The layer for which the activation vectors
are computed.
num_id (str or None): An optional string representing the batch number
for which the activation vectors are computed.
"""
self.av_filesearch = AV._construct_file_search(
path, model_id, identifier, layer, num_id
)
files = glob.glob(self.av_filesearch)
self.files = AV.sort_files(files)
def __getitem__(self, idx: int) -> Union[Tensor, Tuple[Tensor, ...]]:
assert idx < len(self.files), "Index is out of bounds!"
fl = self.files[idx]
av = torch.load(fl)
return av
def __len__(self):
return len(self.files)
AV_DIR_NAME: str = "av"
def __init__(self) -> None:
pass
@staticmethod
def _assemble_model_dir(path: str, model_id: str) -> str:
r"""
Returns a directory path for the given source path `path` and `model_id`.
This path is suffixed with the '/' delimiter.
"""
return "/".join([path, AV.AV_DIR_NAME, model_id, ""])
@staticmethod
def _assemble_file_path(source_dir: str, identifier: str, layer: str) -> str:
r"""
Returns a full filepath given a source directory, layer, and required
identifier. The source dir is not required to end with a "/" delimiter.
"""
if not source_dir.endswith("/"):
source_dir += "/"
filepath = os.path.join(source_dir, identifier)
filepath = os.path.join(filepath, layer)
return filepath
@staticmethod
def _construct_file_search(
source_dir: str,
model_id: str,
identifier: Optional[str] = None,
layer: Optional[str] = None,
num_id: Optional[str] = None,
) -> str:
r"""
Returns a search string that can be used by glob to search `source_dir/model_id`
for the desired layer/identifier pair. Leaving `layer` as None will search the
given identifier(s) across all layers, and leaving `identifier` as None will
search the given layer(s) across all identifiers. Leaving both as None will
return a path to glob for every activation.
Assumes identifier is always specified when saving activations, so that
activations live at source_dir/model_id/identifier/layer
(and never source_dir/model_id/layer)
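For example, on a POSIX filesystem (the arguments below are illustrative)::

    AV._construct_file_search("/tmp/av_root", "resnet18_v1", "train", "layer4")
    # -> "/tmp/av_root/av/resnet18_v1/train/layer4/*.pt"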
"""
av_filesearch = AV._assemble_model_dir(source_dir, model_id)
av_filesearch = os.path.join(
av_filesearch, "*" if identifier is None else identifier
)
av_filesearch = os.path.join(av_filesearch, "*" if layer is None else layer)
av_filesearch = os.path.join(
av_filesearch, "*.pt" if num_id is None else "%s.pt" % num_id
)
return av_filesearch
@staticmethod
def exists(
path: str,
model_id: str,
identifier: Optional[str] = None,
layer: Optional[str] = None,
num_id: Optional[str] = None,
) -> bool:
r"""
Verifies whether the model + layer activations exist
under the path.
Args:
path (str): The path where the activation vectors
for the `model_id` are stored.
model_id (str): The name/version of the model for which layer activations
are being computed and stored.
identifier (str or None): An optional identifier for the layer activations.
Can be used to distinguish between activations for different
training batches. For example, the id could be a suffix composed of
a train/test label and numerical value, such as "-train-xxxxx".
The numerical id is often a monotonic sequence taken from datetime.
layer (str or None): The layer for which the activation vectors are
computed.
num_id (str or None): An optional string representing the batch number for
which the activation vectors are computed.
Returns:
exists (bool): Indicates whether the activation vectors for the `layer`
and `identifier` (if provided) and num_id (if provided) were stored
in the manifold. If no `identifier` is provided, will return `True`
if any layer activation exists, whether it has an identifier or
not, and vice-versa.
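Example (a minimal sketch; the path, model id, identifier, and layer name
are hypothetical placeholders)::

    if AV.exists("/tmp/av_root", "resnet18_v1", identifier="train",
                 layer="layer4"):
        print("cached activations found")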
"""
av_dir = AV._assemble_model_dir(path, model_id)
av_filesearch = AV._construct_file_search(
path, model_id, identifier, layer, num_id
)
return os.path.exists(av_dir) and len(glob.glob(av_filesearch)) > 0
@staticmethod
def save(
path: str,
model_id: str,
identifier: str,
layers: Union[str, List[str]],
act_tensors: Union[Tensor, List[Tensor]],
num_id: str,
) -> None:
r"""
Saves the activation vectors `act_tensors` for the given
`layers` under the manifold `path`.
Args:
path (str): The path where the activation vectors
for the `layer` are stored.
model_id (str): The name/version of the model for which layer activations
are being computed and stored.
identifier (str): An identifier for the layer
activations. Can be used to distinguish between activations for
different training batches. For example, the identifier could be
a suffix composed of a train/test label and numerical value, such
as "-src-abc".
Additionally, (abc) could be a unique identifying number; for
example, it is created automatically from the batch index in
AV.generate_dataset_activations.
The same identifier is assumed for all layers when a list of
`layers` is provided.
layers (str or List of str): The layer(s) for which the activation vectors
are computed.
act_tensors (Tensor or List of Tensor): A batch of activation vectors.
This must match the dimension of `layers`.
num_id (str): A string representing the batch number for which the activation
vectors are computed.
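A minimal sketch, assuming a hypothetical path, model id, and layer name::

    import torch

    acts = torch.randn(32, 512)  # one batch of activations for "layer4"
    AV.save("/tmp/av_root", "resnet18_v1", "train", "layer4", acts, "0")
    # saved to /tmp/av_root/av/resnet18_v1/train/layer4/0.pt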
"""
if isinstance(layers, str):
layers = [layers]
if isinstance(act_tensors, Tensor):
act_tensors = [act_tensors]
if len(layers) != len(act_tensors):
raise ValueError("The dimension of `layers` and `act_tensors` must match!")
av_dir = AV._assemble_model_dir(path, model_id)
for i, layer in enumerate(layers):
av_save_fl_path = os.path.join(
AV._assemble_file_path(av_dir, identifier, layer), "%s.pt" % num_id
)
layer_dir = os.path.dirname(av_save_fl_path)
if not os.path.exists(layer_dir):
os.makedirs(layer_dir)
torch.save(act_tensors[i], av_save_fl_path)
@staticmethod
def load(
path: str,
model_id: str,
identifier: Optional[str] = None,
layer: Optional[str] = None,
num_id: Optional[str] = None,
) -> AVDataset:
r"""
Lazily loads the activation vectors for the given `model_id` and
`layer` saved under the `path`.
Args:
path (str): The path where the activation vectors
for the `layer` are stored.
model_id (str): The name/version of the model for which layer activations
are being computed and stored.
identifier (str or None): An optional identifier for the layer
activations. Can be used to distinguish between activations for
different training batches.
layer (str or None): The layer for which the activation vectors
are computed.
num_id (str or None): An optional string representing the batch number for
which the activation vectors are computed.
Returns:
dataset (AV.AVDataset): An AV.AVDataset that allows iteration
over the activation vectors for the given layer, identifier (if
provided), and num_id (if provided). Returning an AV.AVDataset, as
opposed to a DataLoader constructed from it, offers more
flexibility. Raises RuntimeError if activation vectors are not
found.
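A minimal sketch (reusing the hypothetical values from the `save` example)::

    av_dataset = AV.load("/tmp/av_root", "resnet18_v1", "train", "layer4")
    first_batch = av_dataset[0]  # activations stored under num_id "0"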
"""
av_save_dir = AV._assemble_model_dir(path, model_id)
if os.path.exists(av_save_dir):
avdataset = AV.AVDataset(path, model_id, identifier, layer, num_id)
return avdataset
else:
raise RuntimeError(
f"Activation vectors for model {model_id} was not found at path {path}"
)
@staticmethod
def _manage_loading_layers(
path: str,
model_id: str,
layers: Union[str, List[str]],
load_from_disk: bool = True,
identifier: Optional[str] = None,
num_id: Optional[str] = None,
) -> List[str]:
r"""
Returns unsaved layers, and deletes saved layers if load_from_disk is False.
Args:
path (str): The path where the activation vectors
for the `layer` are stored.
model_id (str): The name/version of the model for which layer activations
are being computed and stored.
layers (str or List of str): The layer(s) for which the activation vectors
are computed.
identifier (str or None): An optional identifier for the layer
activations. Can be used to distinguish between activations for
different training batches.
num_id (str or None): An optional string representing the batch number for
which the activation vectors are computed.
Returns:
List of layer names for which activations should be generated
"""
layers = [layers] if isinstance(layers, str) else layers
unsaved_layers = []
if load_from_disk:
for layer in layers:
if not AV.exists(path, model_id, identifier, layer, num_id):
unsaved_layers.append(layer)
else:
unsaved_layers = layers
warnings.warn(
"Overwriting activations: load_from_disk is set to False. Removing all "
f"activations matching specified parameters {{path: {path}, "
f"model_id: {model_id}, layers: {layers}, identifier: {identifier}}} "
"before generating new activations."
)
for layer in layers:
files = glob.glob(
AV._construct_file_search(path, model_id, identifier, layer)
)
for filename in files:
os.remove(filename)
return unsaved_layers
@staticmethod
def _compute_and_save_activations(
path: str,
model: Module,
model_id: str,
layers: Union[str, List[str]],
inputs: Union[Tensor, Tuple[Tensor, ...]],
identifier: str,
num_id: str,
additional_forward_args: Any = None,
load_from_disk: bool = True,
) -> None:
r"""
Computes layer activations for the given inputs and specified `layers`.
Args:
path (str): The path where the activation vectors
for the `layer` are stored.
model (torch.nn.Module): An instance of pytorch model. This model should
define all of its layers as attributes of the model.
model_id (str): The name/version of the model for which layer activations
are being computed and stored.
layers (str or List of str): The layer(s) for which the activation vectors
are computed.
inputs (tensor or tuple of tensors): Batch of examples for
which layer activations are computed. They are passed to the
input `model`. The first dimension of the `inputs` tensor or tuple of
tensors corresponds to the batch size.
identifier (str): An identifier for the layer
activations. Can be used to distinguish between activations for
different training batches.
num_id (str): A required string representing the batch number for which the
activation vectors are computed.
additional_forward_args (optional): Additional arguments that will be
passed to `model` after inputs.
Default: None
load_from_disk (bool): Forces function to regenerate activations if False.
Default: True
"""
unsaved_layers = AV._manage_loading_layers(
path,
model_id,
layers,
load_from_disk,
identifier,
num_id,
)
layer_modules = [
common._get_module_from_name(model, layer) for layer in unsaved_layers
]
if len(unsaved_layers) > 0:
layer_act = LayerActivation(model, layer_modules)
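# Note: `attribute` is a decorated method; `attribute.__wrapped__` calls the
# underlying, undecorated implementation directly (bypassing the wrapper's
# bookkeeping, e.g. usage logging).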
new_activations = layer_act.attribute.__wrapped__( # type: ignore
layer_act, inputs, additional_forward_args
)
AV.save(path, model_id, identifier, unsaved_layers, new_activations, num_id)
@staticmethod
def _unpack_data(data: Union[Any, Tuple[Any, Any]]) -> Any:
r"""
Helper to extract input from labels when getting items from a Dataset. Assumes
that data is either a single value, or a tuple containing two elements.
The input could itself be a Tuple containing multiple values. If your
dataset returns a Tuple with more than 2 elements, please reformat it such that
all inputs are formatted into a tuple stored at the first position.
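For example (`batch_inputs`, `batch_labels`, and `batch_tensor` are
placeholder names)::

    AV._unpack_data((batch_inputs, batch_labels))  # -> batch_inputs
    AV._unpack_data(batch_tensor)                  # -> batch_tensor, unchanged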
"""
if isinstance(data, tuple) or isinstance(data, list):
data = data[0]
return data
r"""TODO:
1. Can propagate saving labels along with activations.
2. Use of additional_forward_args when sourcing from dataset?
"""
@staticmethod
def generate_dataset_activations(
path: str,
model: Module,
model_id: str,
layers: Union[str, List[str]],
dataloader: DataLoader,
identifier: str = "default",
load_from_disk: bool = True,
return_activations: bool = False,
) -> Optional[Union[AVDataset, List[AVDataset]]]:
r"""
Computes layer activations for a source dataset and specified `layers`. Assumes
that the dataset returns a single value, or a tuple containing two elements
(see AV._unpack_data).
Args:
path (str): The path where the activation vectors
for the `layer` are stored.
model (torch.nn.Module): An instance of a PyTorch model. This model should
define all of its layers as attributes of the model.
model_id (str): The name/version of the model for which layer activations
are being computed and stored.
layers (str or List of str): The layer(s) for which the activation vectors
are computed.
dataloader (torch.utils.data.DataLoader): DataLoader that yields the batches
of the source dataset for which layer activations are computed. The
batches are passed to the input `model`.
identifier (str): An identifier for the layer
activations. Can be used to distinguish between activations for
different training batches.
Default: "default"
load_from_disk (bool): Forces function to regenerate activations if False.
Default: True
return_activations (bool, optional): Whether to return the activations.
Default: False
Returns: If `return_activations == True`, returns a single `AVDataset` if
`layers` is a str, otherwise, a list of `AVDataset`s of the length
of `layers`, where each element corresponds to a layer. In either
case, each `AVDataset` represents the activations for a single layer,
over the entire `dataloader`. If `return_activations == False`,
does not return anything.
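A minimal sketch, assuming a hypothetical toy model and in-memory dataset::

    import torch
    from torch import nn
    from torch.utils.data import DataLoader, TensorDataset

    model = nn.Sequential(nn.Linear(8, 4), nn.ReLU(), nn.Linear(4, 2))
    loader = DataLoader(TensorDataset(torch.randn(16, 8)), batch_size=4)
    avds = AV.generate_dataset_activations(
        "/tmp/av_root", model, "toy_mlp", "2", loader,
        identifier="train", return_activations=True,
    )
    # avds is an AV.AVDataset over the saved activations of layer "2"
    # (the final Linear), one file per source batch.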
"""
unsaved_layers = AV._manage_loading_layers(
path,
model_id,
layers,
load_from_disk,
identifier,
)
if len(unsaved_layers) > 0:
for i, data in enumerate(dataloader):
AV._compute_and_save_activations(
path,
model,
model_id,
layers,
AV._unpack_data(data),
identifier,
str(i),
)
if not return_activations:
return None
if isinstance(layers, str):
return AV.load(path, model_id, identifier, layers)
else:
return [AV.load(path, model_id, identifier, layer) for layer in layers]
@staticmethod
def sort_files(files: List[str]) -> List[str]:
r"""
Utility for sorting files based on natural sorting instead of the default
lexicographical sort.
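For example::

    AV.sort_files(["layer4/10.pt", "layer4/2.pt"])
    # -> ["layer4/2.pt", "layer4/10.pt"], whereas a plain lexicographic sort
    #    would place "10.pt" before "2.pt"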
"""
def split_alphanum(s):
r"""
Splits string into a list of strings and numbers
"z23a" -> ["z", 23, "a"]
"""
return [int(x) if x.isdigit() else x for x in re.split("([0-9]+)", s)]
return sorted(files, key=split_alphanum)