#!/usr/bin/env python3
import glob
import os
import re
import warnings
from typing import Any, List, Optional, Tuple, Union
import captum._utils.common as common
import torch
from captum.attr import LayerActivation
from torch import Tensor
from torch.nn import Module
from torch.utils.data import DataLoader, Dataset
class AV:
r"""
This class provides functionality to store and load activation vectors
generated for pre-defined neural network layers.
It also provides functionality to check if activation vectors already
exist in the manifold and other auxiliary functions.
This class also defines a torch `Dataset`, representing Activation Vectors,
which enables lazy access to activation vectors and layers stored in the manifold.
"""
r"""
The name of the subfolder in the manifold where the activation vectors
are stored.
"""
class AVDataset(Dataset):
r"""
This dataset enables access to activation vectors for a given `model` stored
under a pre-defined path.
The iterator of this dataset returns a batch of data tensors.
Additionally, subsets of the model activations can be loaded based on layer
or identifier or num_id (representing batch number in source dataset).
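A minimal usage sketch (the path, model id, identifier, and layer name
below are hypothetical placeholders)::

    avds = AV.AVDataset("/tmp/av_root", "resnet18_v1", "train", "layer4")
    first_saved_batch = avds[0]  # Tensor for the first stored batch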
"""
def __init__(
self,
path: str,
model_id: str,
identifier: Optional[str] = None,
layer: Optional[str] = None,
num_id: Optional[str] = None,
):
r"""
Loads into memory the list of all activation file paths associated
with the input `model_id`.
Args:
path (str): The path where the activation vectors
for the `layer` are stored.
model_id (str): The name/version of the model for which layer
activations are being computed and stored.
identifier (str or None): An optional identifier for the layer
activations. Can be used to distinguish between activations for
different training batches.
layer (str or None): The layer for which the activation vectors
are computed.
num_id (str or None): An optional string representing the batch number
for which the activation vectors are computed.
"""
self.av_filesearch = AV._construct_file_search(
path, model_id, identifier, layer, num_id
)
files = glob.glob(self.av_filesearch)
self.files = AV.sort_files(files)
def __getitem__(self, idx: int) -> Union[Tensor, Tuple[Tensor, ...]]:
assert idx < len(self.files), "Index is out of bounds!"
fl = self.files[idx]
av = torch.load(fl)
return av
def __len__(self):
return len(self.files)
AV_DIR_NAME: str = "av"
def __init__(self) -> None:
pass
@staticmethod
def _assemble_model_dir(path: str, model_id: str) -> str:
r"""
Returns a directory path for the given source path `path` and `model_id`.
This path is suffixed with the '/' delimiter.
"""
return "/".join([path, AV.AV_DIR_NAME, model_id, ""])
@staticmethod
def _assemble_file_path(source_dir: str, identifier: str, layer: str) -> str:
r"""
Returns a full filepath given a source directory, layer, and required
identifier. The source dir is not required to end with a "/" delimiter.
"""
if not source_dir.endswith("/"):
source_dir += "/"
filepath = os.path.join(source_dir, identifier)
filepath = os.path.join(filepath, layer)
return filepath
@staticmethod
def _construct_file_search(
source_dir: str,
model_id: str,
identifier: Optional[str] = None,
layer: Optional[str] = None,
num_id: Optional[str] = None,
) -> str:
r"""
Returns a search string that can be used by glob to search `source_dir/model_id`
for the desired layer/identifier pair. Leaving `layer` as None will search the
given identifier(s) across all layers, and leaving `identifier` as None will
search the given layer(s) across all identifiers. Leaving both as None will
return a path to glob for every activation.
Assumes identifier is always specified when saving activations, so that
activations live at source_dir/model_id/identifier/layer
(and never source_dir/model_id/layer)
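For example, on a POSIX filesystem (the arguments below are illustrative)::

    AV._construct_file_search("/tmp/av_root", "resnet18_v1", "train", "layer4")
    # -> "/tmp/av_root/av/resnet18_v1/train/layer4/*.pt"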
"""
av_filesearch = AV._assemble_model_dir(source_dir, model_id)
av_filesearch = os.path.join(
av_filesearch, "*" if identifier is None else identifier
)
av_filesearch = os.path.join(av_filesearch, "*" if layer is None else layer)
av_filesearch = os.path.join(
av_filesearch, "*.pt" if num_id is None else "%s.pt" % num_id
)
return av_filesearch
@staticmethod
def exists(
path: str,
model_id: str,
identifier: Optional[str] = None,
layer: Optional[str] = None,
num_id: Optional[str] = None,
) -> bool:
r"""
Verifies whether the model + layer activations exist
under the path.
Args:
path (str): The path where the activation vectors
for the `model_id` are stored.
model_id (str): The name/version of the model for which layer activations
are being computed and stored.
identifier (str or None): An optional identifier for the layer activations.
Can be used to distinguish between activations for different
training batches. For example, the id could be a suffix composed of
a train/test label and numerical value, such as "-train-xxxxx".
The numerical id is often a monotonic sequence taken from datetime.
layer (str or None): The layer for which the activation vectors are
computed.
num_id (str or None): An optional string representing the batch number for
which the activation vectors are computed.
Returns:
exists (bool): Indicates whether the activation vectors for the `layer`
and `identifier` (if provided) and num_id (if provided) were stored
in the manifold. If no `identifier` is provided, will return `True`
if any layer activation exists, whether it has an identifier or
not, and vice-versa.
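Example (a minimal sketch; the path, model id, identifier, and layer name
are hypothetical placeholders)::

    if AV.exists("/tmp/av_root", "resnet18_v1", identifier="train",
                 layer="layer4"):
        print("cached activations found")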
"""
av_dir = AV._assemble_model_dir(path, model_id)
av_filesearch = AV._construct_file_search(
path, model_id, identifier, layer, num_id
)
return os.path.exists(av_dir) and len(glob.glob(av_filesearch)) > 0
@staticmethod
def save(
path: str,
model_id: str,
identifier: str,
layers: Union[str, List[str]],
act_tensors: Union[Tensor, List[Tensor]],
num_id: str,
) -> None:
r"""
Saves the activation vectors `act_tensors` for the given
`layers` under the manifold `path`.
Args:
path (str): The path where the activation vectors
for the `layer` are stored.
model_id (str): The name/version of the model for which layer activations
are being computed and stored.
identifier (str): An identifier for the layer
activations. Can be used to distinguish between activations for
different training batches. For example, the identifier could be
a suffix composed of a train/test label and numerical value, such
as "-src-abc".
Additionally, (abc) could be a unique identifying number; for
example, it is created automatically from the batch index in
AV.generate_dataset_activations.
The same identifier is assumed for all layers when a list of
`layers` is provided.
layers (str or List of str): The layer(s) for which the activation vectors
are computed.
act_tensors (Tensor or List of Tensor): A batch of activation vectors.
This must match the dimension of `layers`.
num_id (str): A string representing the batch number for which the activation
vectors are computed.
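A minimal sketch, assuming a hypothetical path, model id, and layer name::

    import torch

    acts = torch.randn(32, 512)  # one batch of activations for "layer4"
    AV.save("/tmp/av_root", "resnet18_v1", "train", "layer4", acts, "0")
    # saved to /tmp/av_root/av/resnet18_v1/train/layer4/0.pt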
"""
if isinstance(layers, str):
layers = [layers]
if isinstance(act_tensors, Tensor):
act_tensors = [act_tensors]
if len(layers) != len(act_tensors):
raise ValueError("The dimension of `layers` and `act_tensors` must match!")
av_dir = AV._assemble_model_dir(path, model_id)
for i, layer in enumerate(layers):
av_save_fl_path = os.path.join(
AV._assemble_file_path(av_dir, identifier, layer), "%s.pt" % num_id
)
layer_dir = os.path.dirname(av_save_fl_path)
if not os.path.exists(layer_dir):
os.makedirs(layer_dir)
torch.save(act_tensors[i], av_save_fl_path)
@staticmethod
def load(
path: str,
model_id: str,
identifier: Optional[str] = None,
layer: Optional[str] = None,
num_id: Optional[str] = None,
) -> AVDataset:
r"""
Lazily loads the activation vectors for the given `model_id` and
`layer` saved under the `path`.
Args:
path (str): The path where the activation vectors
for the `layer` are stored.
model_id (str): The name/version of the model for which layer activations
are being computed and stored.
identifier (str or None): An optional identifier for the layer
activations. Can be used to distinguish between activations for
different training batches.
layer (str or None): The layer for which the activation vectors
are computed.
num_id (str or None): An optional string representing the batch number for
which the activation vectors are computed.
Returns:
dataset (AV.AVDataset): An AV.AVDataset that allows iteration
over the activation vectors for the given layer, identifier (if
provided), and num_id (if provided). Returning an AV.AVDataset, as
opposed to a DataLoader constructed from it, offers more
flexibility. Raises RuntimeError if activation vectors are not
found.
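A minimal sketch (reusing the hypothetical values from the `save` example)::

    av_dataset = AV.load("/tmp/av_root", "resnet18_v1", "train", "layer4")
    first_batch = av_dataset[0]  # activations stored under num_id "0"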
"""
av_save_dir = AV._assemble_model_dir(path, model_id)
if os.path.exists(av_save_dir):
avdataset = AV.AVDataset(path, model_id, identifier, layer, num_id)
return avdataset
else:
raise RuntimeError(
f"Activation vectors for model {model_id} was not found at path {path}"
)
@staticmethod
def _manage_loading_layers(
path: str,
model_id: str,
layers: Union[str, List[str]],
load_from_disk: bool = True,
identifier: Optional[str] = None,
num_id: Optional[str] = None,
) -> List[str]:
r"""
Returns unsaved layers, and deletes saved layers if load_from_disk is False.
Args:
path (str): The path where the activation vectors
for the `layer` are stored.
model_id (str): The name/version of the model for which layer activations
are being computed and stored.
layers (str or List of str): The layer(s) for which the activation vectors
are computed.
identifier (str or None): An optional identifier for the layer
activations. Can be used to distinguish between activations for
different training batches.
num_id (str or None): An optional string representing the batch number for
which the activation vectors are computed.
Returns:
List of layer names for which activations should be generated
"""
layers = [layers] if isinstance(layers, str) else layers
unsaved_layers = []
if load_from_disk:
for layer in layers:
if not AV.exists(path, model_id, identifier, layer, num_id):
unsaved_layers.append(layer)
else:
unsaved_layers = layers
warnings.warn(
"Overwriting activations: load_from_disk is set to False. Removing all "
f"activations matching specified parameters {{path: {path}, "
f"model_id: {model_id}, layers: {layers}, identifier: {identifier}}} "
"before generating new activations."
)
for layer in layers:
files = glob.glob(
AV._construct_file_search(path, model_id, identifier, layer)
)
for filename in files:
os.remove(filename)
return unsaved_layers
@staticmethod
def _compute_and_save_activations(
path: str,
model: Module,
model_id: str,
layers: Union[str, List[str]],
inputs: Union[Tensor, Tuple[Tensor, ...]],
identifier: str,
num_id: str,
additional_forward_args: Any = None,
load_from_disk: bool = True,
) -> None:
r"""
Computes layer activations for the given inputs and specified `layers`.
Args:
path (str): The path where the activation vectors
for the `layer` are stored.
model (torch.nn.Module): An instance of pytorch model. This model should
define all of its layers as attributes of the model.
model_id (str): The name/version of the model for which layer activations
are being computed and stored.
layers (str or List of str): The layer(s) for which the activation vectors
are computed.
inputs (tensor or tuple of tensors): Batch of examples for
which layer activations are computed. They are passed to the
input `model`. The first dimension of the `inputs` tensor or tuple of
tensors corresponds to the batch size.
identifier (str): An identifier for the layer
activations. Can be used to distinguish between activations for
different training batches.
num_id (str): A required string representing the batch number for which the
activation vectors are computed.
additional_forward_args (optional): Additional arguments that will be
passed to `model` after inputs.
Default: None
load_from_disk (bool): Forces function to regenerate activations if False.
Default: True
"""
unsaved_layers = AV._manage_loading_layers(
path,
model_id,
layers,
load_from_disk,
identifier,
num_id,
)
layer_modules = [
common._get_module_from_name(model, layer) for layer in unsaved_layers
]
if len(unsaved_layers) > 0:
layer_act = LayerActivation(model, layer_modules)
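# Note: `attribute` is a decorated method; `attribute.__wrapped__` calls the
# underlying, undecorated implementation directly (bypassing the wrapper's
# bookkeeping, e.g. usage logging).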
new_activations = layer_act.attribute.__wrapped__( # type: ignore
layer_act, inputs, additional_forward_args
)
AV.save(path, model_id, identifier, unsaved_layers, new_activations, num_id)
@staticmethod
def _unpack_data(data: Union[Any, Tuple[Any, Any]]) -> Any:
r"""
Helper to extract input from labels when getting items from a Dataset. Assumes
that data is either a single value, or a tuple containing two elements.
The input could itself be a Tuple containing multiple values. If your
dataset returns a Tuple with more than 2 elements, please reformat it such that
all inputs are formatted into a tuple stored at the first position.
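For example (`batch_inputs`, `batch_labels`, and `batch_tensor` are
placeholder names)::

    AV._unpack_data((batch_inputs, batch_labels))  # -> batch_inputs
    AV._unpack_data(batch_tensor)                  # -> batch_tensor, unchanged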
"""
if isinstance(data, tuple) or isinstance(data, list):
data = data[0]
return data
r"""TODO:
1. Can propagate saving labels along with activations.
2. Use of additional_forward_args when sourcing from dataset?
"""
@staticmethod
def generate_dataset_activations(
path: str,
model: Module,
model_id: str,
layers: Union[str, List[str]],
dataloader: DataLoader,
identifier: str = "default",
load_from_disk: bool = True,
return_activations: bool = False,
) -> Optional[Union[AVDataset, List[AVDataset]]]:
r"""
Computes layer activations for a source dataset and specified `layers`. Assumes
that the dataset returns a single value, or a tuple containing two elements
(see AV._unpack_data).
Args:
path (str): The path where the activation vectors
for the `layer` are stored.
model (torch.nn.Module): An instance of a PyTorch model. This model should
define all of its layers as attributes of the model.
model_id (str): The name/version of the model for which layer activations
are being computed and stored.
layers (str or List of str): The layer(s) for which the activation vectors
are computed.
dataloader (torch.utils.data.DataLoader): DataLoader that yields the batches
of the source dataset for which layer activations are computed. The
batches are passed to the input `model`.
identifier (str): An identifier for the layer
activations. Can be used to distinguish between activations for
different training batches.
Default: "default"
load_from_disk (bool): Forces function to regenerate activations if False.
Default: True
return_activations (bool, optional): Whether to return the activations.
Default: False
Returns: If `return_activations == True`, returns a single `AVDataset` if
`layers` is a str, otherwise, a list of `AVDataset`s of the length
of `layers`, where each element corresponds to a layer. In either
case, each `AVDataset` represents the activations for a single layer,
over the entire `dataloader`. If `return_activations == False`,
does not return anything.
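A minimal sketch, assuming a hypothetical toy model and in-memory dataset::

    import torch
    from torch import nn
    from torch.utils.data import DataLoader, TensorDataset

    model = nn.Sequential(nn.Linear(8, 4), nn.ReLU(), nn.Linear(4, 2))
    loader = DataLoader(TensorDataset(torch.randn(16, 8)), batch_size=4)
    avds = AV.generate_dataset_activations(
        "/tmp/av_root", model, "toy_mlp", "2", loader,
        identifier="train", return_activations=True,
    )
    # avds is an AV.AVDataset over the saved activations of layer "2"
    # (the final Linear), one file per source batch.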
"""
unsaved_layers = AV._manage_loading_layers(
path,
model_id,
layers,
load_from_disk,
identifier,
)
if len(unsaved_layers) > 0:
for i, data in enumerate(dataloader):
AV._compute_and_save_activations(
path,
model,
model_id,
layers,
AV._unpack_data(data),
identifier,
str(i),
)
if not return_activations:
return None
if isinstance(layers, str):
return AV.load(path, model_id, identifier, layers)
else:
return [AV.load(path, model_id, identifier, layer) for layer in layers]
@staticmethod
def sort_files(files: List[str]) -> List[str]:
r"""
Utility for sorting files based on natural sorting instead of the default
lexicographical sort.
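For example::

    AV.sort_files(["layer4/10.pt", "layer4/2.pt"])
    # -> ["layer4/2.pt", "layer4/10.pt"], whereas a plain lexicographic sort
    #    would place "10.pt" before "2.pt"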
"""
def split_alphanum(s):
r"""
Splits string into a list of strings and numbers
"z23a" -> ["z", 23, "a"]
"""
return [int(x) if x.isdigit() else x for x in re.split("([0-9]+)", s)]
return sorted(files, key=split_alphanum)