from typing import Any, Dict, Optional, Tuple

import torch

from transformers.cache_utils import Cache, DynamicCache, SinkCache

from .utils import LayerTypeParser


class IndexedCache(Cache):
    """
    Similar to the `DynamicCache` class, but with the ability to index the cache by layer index. `DynamicCache`
    assumes that all layers compute KVs, while `IndexedCache` allows for a sparser, more flexible cache structure.
    """

    build_position_ids_based_on_cache = False

    def __init__(self) -> None:
        super().__init__()
        self.key_cache: Dict[int, torch.Tensor] = {}
        self.value_cache: Dict[int, torch.Tensor] = {}
        self._seen_tokens = 0  # number of tokens the cache has seen, tracked on the first cached layer
        self._update = True  # when False, `update` returns the concatenated states without storing them

    def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
        sequence length.
        """
        if layer_idx in self.key_cache:
            return (self.key_cache[layer_idx], self.value_cache[layer_idx])
        else:
            raise KeyError(
                f"Cache has no KVs for layer {layer_idx} (cached layers: {sorted(self.key_cache.keys())})"
            )

    def __iter__(self):
        """
        Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
        keys and values.
        """
        for layer_idx in sorted(self.key_cache.keys()):
            yield (self.key_cache[layer_idx], self.value_cache[layer_idx])

    def __len__(self):
        """
        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
        to the number of layers that compute KVs in the model.
        """
        return len(self.key_cache)

    @property
    def min_layer(self) -> Optional[int]:
        """The smallest layer index present in the cache, or `None` if the cache is empty."""
        return min(self.key_cache.keys()) if len(self.key_cache) > 0 else None

    def is_min_layer(self, layer_idx: int) -> bool:
        return self.min_layer is None or self.min_layer == layer_idx

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `IndexedCache`.

        Return:
            A tuple containing the updated key and value states.
        """
        # Update the number of seen tokens (only once per forward pass, on the first cached layer)
        if self.is_min_layer(layer_idx):
            self._seen_tokens += key_states.shape[-2]

        # Concatenate the new states with the existing cache along the sequence dimension
        if layer_idx not in self.key_cache:
            new_key_states = key_states
            new_value_states = value_states
        else:
            new_key_states = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
            new_value_states = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)

        if self._update:
            self.key_cache[layer_idx] = new_key_states
            self.value_cache[layer_idx] = new_value_states

        return new_key_states, new_value_states

    def get_seq_length(self, layer_idx: Optional[int] = None) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        if layer_idx is None:
            layer_idx = self.min_layer

        if layer_idx not in self.key_cache:
            return 0

        return self.key_cache[layer_idx].shape[-2]

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states. `IndexedCache` does not have a maximum length."""
        return None

    @classmethod
    def from_cache(cls, dynamic_cache: DynamicCache, *args, **kwargs) -> "IndexedCache":
        """Converts a `DynamicCache` into an equivalent `IndexedCache`."""
        cache = cls(*args, **kwargs)

        cache._seen_tokens = dynamic_cache._seen_tokens
        for layer_idx in range(len(dynamic_cache.key_cache)):
            key_states, value_states = dynamic_cache[layer_idx]
            cache.update(key_states, value_states, layer_idx)

        return cache


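# A minimal usage sketch (illustrative only, not part of the library API): toy
# tensors of shape (batch=1, heads=2, seq_len=3, head_dim=4) show that layers may
# be cached sparsely and that `update` concatenates along the sequence dimension.
def _demo_indexed_cache() -> None:
    cache = IndexedCache()
    key = torch.zeros(1, 2, 3, 4)
    value = torch.zeros(1, 2, 3, 4)
    cache.update(key, value, layer_idx=2)  # cached layers need not start at 0
    cache.update(key, value, layer_idx=5)
    assert len(cache) == 2  # two layers hold KVs
    assert cache.get_seq_length() == 3  # length is read from the min layer
    new_key, _ = cache.update(key, value, layer_idx=2)
    assert new_key.shape[-2] == 6  # states are concatenated along dim=-2

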
class IndexedSinkCache(Cache):
    """
    A fixed version of the `SinkCache` class from the transformers library, which additionally allows the cache to
    be indexed by layer index, similar to the `IndexedCache` class.
    """

    build_position_ids_based_on_cache = True

    def __init__(self, window_length: Optional[int] = None, num_sink_tokens: Optional[int] = None) -> None:
        super().__init__()
        self.key_cache: Dict[int, torch.Tensor] = {}
        self.value_cache: Dict[int, torch.Tensor] = {}
        self.window_length = window_length
        self.num_sink_tokens = num_sink_tokens
        self.cos_sin_rerotation_cache = {}
        self._cos_cache = None
        self._sin_cache = None
        self._seen_tokens = 0
        self._update = True

    @staticmethod
    def _rotate_half(x):
        x1 = x[..., : x.shape[-1] // 2]
        x2 = x[..., x.shape[-1] // 2 :]
        return torch.cat((-x2, x1), dim=-1)

    def _apply_key_rotary_pos_emb(
        self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
    ) -> torch.Tensor:
        rotated_key_states = (key_states * cos) + (self._rotate_half(key_states) * sin)
        return rotated_key_states

    def _get_rerotation_cos_sin(
        self, offset: int, dtype: torch.dtype, cos: torch.Tensor, sin: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if offset not in self.cos_sin_rerotation_cache:
            # Upcast to float32 temporarily for better accuracy
            cos = cos.to(torch.float32)
            sin = sin.to(torch.float32)

            # Compute the cos and sin required to back-rotate the kept keys by `offset` positions,
            # using the trigonometric angle-difference identities
            original_cos = cos[self.num_sink_tokens + offset :]
            shifted_cos = cos[self.num_sink_tokens : -offset]
            original_sin = sin[self.num_sink_tokens + offset :]
            shifted_sin = sin[self.num_sink_tokens : -offset]
            rerotation_cos = original_cos * shifted_cos + original_sin * shifted_sin
            rerotation_sin = -original_sin * shifted_cos + original_cos * shifted_sin

            self.cos_sin_rerotation_cache[offset] = (
                rerotation_cos.to(dtype).unsqueeze(0),
                rerotation_sin.to(dtype).unsqueeze(0),
            )
        return self.cos_sin_rerotation_cache[offset]

    @property
    def min_layer(self) -> Optional[int]:
        """The smallest layer index present in the cache, or `None` if the cache is empty."""
        return min(self.key_cache.keys()) if len(self.key_cache) > 0 else None

    def is_min_layer(self, layer_idx: int) -> bool:
        return self.min_layer is None or self.min_layer == layer_idx

    def get_seq_length(self, layer_idx: Optional[int] = None) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        if layer_idx is None:
            layer_idx = self.min_layer

        if layer_idx not in self.key_cache:
            return 0

        return self.key_cache[layer_idx].shape[-2]

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states."""
        return self.window_length

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. The following arguments can be used in
                `IndexedSinkCache`: `sin`, `cos` and `partial_rotation_size`. These arguments are used with models
                using RoPE, to recompute the rotation as the tokens are shifted.

        Return:
            A tuple containing the updated key and value states.
        """
        # Optional kwargs for `IndexedSinkCache` -- needed on models using RoPE
        cache_kwargs = cache_kwargs if cache_kwargs is not None else {}
        sin = cache_kwargs.get("sin")
        cos = cache_kwargs.get("cos")
        partial_rotation_size = cache_kwargs.get("partial_rotation_size")
        using_rope = cos is not None and sin is not None

        # Update the number of seen tokens (only once per forward pass, on the first cached layer)
        if self.is_min_layer(layer_idx):
            self._seen_tokens += key_states.shape[-2]

        # Update the sin/cos caches, which hold the sin/cos values for all possible positions
        if using_rope and self.is_min_layer(layer_idx):
            # `sin`/`cos` may come with or without a leading batch dimension
            if cos.dim() == 2:
                self._cos_cache = cos
                self._sin_cache = sin
            else:
                if self._cos_cache is None:
                    self._cos_cache = cos[0, ...]
                    self._sin_cache = sin[0, ...]
                elif self._cos_cache.shape[0] < self.window_length + key_states.shape[-2]:
                    self._cos_cache = torch.cat([self._cos_cache[: self.window_length], cos[0, ...]], dim=0)
                    self._sin_cache = torch.cat([self._sin_cache[: self.window_length], sin[0, ...]], dim=0)

        # Concatenate the new states with the existing cache along the sequence dimension
        if layer_idx not in self.key_cache:
            new_key_states = key_states
            new_value_states = value_states
        else:
            new_key_states = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
            new_value_states = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)

        if self._update:
            self.key_cache[layer_idx] = new_key_states
            self.value_cache[layer_idx] = new_value_states

        # Evict tokens beyond the window, keeping the sink tokens and the most recent tokens
        if (seq_length := self.get_seq_length(layer_idx)) > self.window_length:
            keys_to_keep = self.key_cache[layer_idx][:, :, -self.window_length + self.num_sink_tokens :]

            # On RoPE models, we need to recompute the key rotation as the tokens are shifted
            if using_rope:
                rerotation_cos, rerotation_sin = self._get_rerotation_cos_sin(
                    seq_length - self.window_length,
                    key_states.dtype,
                    self._cos_cache[:seq_length],
                    self._sin_cache[:seq_length],
                )
                if partial_rotation_size is not None:
                    keys_to_keep, keys_pass = (
                        keys_to_keep[..., :partial_rotation_size],
                        keys_to_keep[..., partial_rotation_size:],
                    )
                keys_to_keep = self._apply_key_rotary_pos_emb(keys_to_keep, rerotation_cos, rerotation_sin)
                if partial_rotation_size is not None:
                    keys_to_keep = torch.cat((keys_to_keep, keys_pass), dim=-1)

            # Concatenate the sink tokens with the kept tokens
            sink_keys = self.key_cache[layer_idx][:, :, : self.num_sink_tokens]
            self.key_cache[layer_idx] = torch.cat([sink_keys, keys_to_keep], dim=-2)

            sink_values = self.value_cache[layer_idx][:, :, : self.num_sink_tokens]
            values_to_keep = self.value_cache[layer_idx][:, :, -self.window_length + self.num_sink_tokens :]
            self.value_cache[layer_idx] = torch.cat([sink_values, values_to_keep], dim=-2)

        return new_key_states, new_value_states

    @classmethod
    def from_cache(cls, sink_cache: SinkCache, *args, **kwargs) -> "IndexedSinkCache":
        """Converts a `SinkCache` into an equivalent `IndexedSinkCache`."""
        cache = cls(*args, **kwargs)

        cache.window_length = sink_cache.window_length
        cache.num_sink_tokens = sink_cache.num_sink_tokens
        cache._seen_tokens = sink_cache._seen_tokens
        cache._cos_cache = sink_cache._cos_cache
        cache._sin_cache = sink_cache._sin_cache
        cache.cos_sin_rerotation_cache = sink_cache.cos_sin_rerotation_cache
        for layer_idx in range(len(sink_cache.key_cache)):
            cache.key_cache[layer_idx] = sink_cache.key_cache[layer_idx]
            cache.value_cache[layer_idx] = sink_cache.value_cache[layer_idx]

        return cache


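# A minimal sketch of the sink eviction policy (illustrative only): without RoPE
# kwargs (`cos`/`sin` omitted), the cache simply keeps the `num_sink_tokens`
# oldest tokens plus the most recent tokens, `window_length` tokens in total.
def _demo_indexed_sink_cache() -> None:
    cache = IndexedSinkCache(window_length=6, num_sink_tokens=2)
    key = torch.zeros(1, 2, 8, 4)
    value = torch.zeros(1, 2, 8, 4)
    cache.update(key, value, layer_idx=0, cache_kwargs={})
    assert cache._seen_tokens == 8  # 8 tokens were seen ...
    assert cache.get_seq_length(0) == 6  # ... but only 2 sink + 4 recent are kept

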
class IndexedSlidingWindowCache(IndexedCache):
    """
    Similar to the `SlidingWindowCache` class, but with the ability to index the cache by layer index. It is no
    longer a subclass of `StaticCache`, as this cache is dynamic.
    """

    build_position_ids_based_on_cache = False

    def __init__(self, sliding_window: Optional[int] = None) -> None:
        super().__init__()
        self.sliding_window = sliding_window

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Update the number of seen tokens (only once per forward pass, on the first cached layer)
        if self.is_min_layer(layer_idx):
            self._seen_tokens += key_states.shape[-2]

        # Concatenate the new states with the existing cache along the sequence dimension
        if layer_idx not in self.key_cache:
            new_key_states = key_states
            new_value_states = value_states
        else:
            new_key_states = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
            new_value_states = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)

        if self._update:
            self.key_cache[layer_idx] = new_key_states
            self.value_cache[layer_idx] = new_value_states

        # Keep only the most recent `sliding_window` tokens
        if self.get_seq_length(layer_idx) > self.sliding_window:
            self.key_cache[layer_idx] = self.key_cache[layer_idx][:, :, -self.sliding_window :]
            self.value_cache[layer_idx] = self.value_cache[layer_idx][:, :, -self.sliding_window :]

        return new_key_states, new_value_states

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states, i.e. the sliding window size."""
        return self.sliding_window

    @classmethod
    def from_cache(
        cls, sliding_window_cache: "IndexedSlidingWindowCache", *args, **kwargs
    ) -> "IndexedSlidingWindowCache":
        """Overrides `IndexedCache.from_cache` to also copy the sliding window size."""
        cache = cls(*args, **kwargs)

        cache._seen_tokens = sliding_window_cache._seen_tokens
        cache.sliding_window = sliding_window_cache.sliding_window
        for layer_idx in range(len(sliding_window_cache.key_cache)):
            cache.key_cache[layer_idx] = sliding_window_cache.key_cache[layer_idx]
            cache.value_cache[layer_idx] = sliding_window_cache.value_cache[layer_idx]

        return cache


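# A minimal sketch (illustrative only): once the cached length exceeds
# `sliding_window`, only the most recent `sliding_window` tokens are kept.
def _demo_indexed_sliding_window_cache() -> None:
    cache = IndexedSlidingWindowCache(sliding_window=4)
    key = torch.zeros(1, 2, 3, 4)
    value = torch.zeros(1, 2, 3, 4)
    cache.update(key, value, layer_idx=0)
    cache.update(key, value, layer_idx=0)  # 6 tokens seen in total
    assert cache._seen_tokens == 6
    assert cache.get_seq_length(0) == 4  # trimmed to the window

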
class IndexedHybridCache(IndexedSlidingWindowCache, IndexedCache):
    """
    Hybrid cache class to be used for models that alternate between local sliding-window attention and global
    attention in every other layer. Under the hood, it leverages `IndexedSlidingWindowCache` for sliding-window
    attention layers and `IndexedCache` for global attention layers.
    """

    build_position_ids_based_on_cache = False

    def __init__(self, parser: Optional[LayerTypeParser] = None, sliding_window: Optional[int] = None) -> None:
        super().__init__(sliding_window=sliding_window)
        self.parser = parser

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Dispatch to the sliding-window or the global update depending on the layer type
        if self.parser[layer_idx].use_sliding_window:
            return IndexedSlidingWindowCache.update(self, key_states, value_states, layer_idx, cache_kwargs)
        else:
            return IndexedCache.update(self, key_states, value_states, layer_idx, cache_kwargs)

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states. Global layers impose no maximum length."""
        return IndexedCache.get_max_length(self)

    @classmethod
    def from_cache(cls, hybrid_cache: "IndexedHybridCache", *args, **kwargs) -> "IndexedHybridCache":
        """Overrides `IndexedSlidingWindowCache.from_cache` to also copy the layer-type parser."""
        cache = cls(*args, **kwargs)

        cache._seen_tokens = hybrid_cache._seen_tokens
        cache.sliding_window = hybrid_cache.sliding_window
        cache.parser = hybrid_cache.parser
        for layer_idx in range(len(hybrid_cache.key_cache)):
            cache.key_cache[layer_idx] = hybrid_cache.key_cache[layer_idx]
            cache.value_cache[layer_idx] = hybrid_cache.value_cache[layer_idx]

        return cache


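# A minimal sketch (illustrative only) using a stand-in for `LayerTypeParser`:
# only the `use_sliding_window` flag consulted by `update` is stubbed out, so
# sliding-window layers get trimmed while global-attention layers grow freely.
class _StubLayerInfo:
    def __init__(self, use_sliding_window: bool) -> None:
        self.use_sliding_window = use_sliding_window


class _StubParser:
    def __getitem__(self, layer_idx: int) -> _StubLayerInfo:
        return _StubLayerInfo(use_sliding_window=layer_idx % 2 == 0)


def _demo_indexed_hybrid_cache() -> None:
    cache = IndexedHybridCache(parser=_StubParser(), sliding_window=4)
    key = torch.zeros(1, 2, 3, 4)
    value = torch.zeros(1, 2, 3, 4)
    for _ in range(2):
        cache.update(key, value, layer_idx=0)  # sliding-window layer
        cache.update(key, value, layer_idx=1)  # global-attention layer
    assert cache.get_seq_length(0) == 4  # trimmed to the window
    assert cache.get_seq_length(1) == 6  # grows without bound

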
class LayerCache(torch.nn.Module):
    """
    A cache for storing the key-value pairs for layers.
    """

    def __init__(self) -> None:
        """
        The placeholder is used to expand the key-value pairs if a layer attends to the top layers.
        Size: (batch_size, num_key_value_heads, 1, head_dim)
        """
        super().__init__()
        self.key_layer_cache: Dict[int, torch.Tensor] = {}
        self.value_layer_cache: Dict[int, torch.Tensor] = {}
        self.layer_type = None
        self.placeholder = None

    def setup(self, placeholder: torch.Tensor):
        """Set up the cache. Calling this method is necessary if any layer attends to the top layers."""
        self.placeholder = placeholder

    def initialize(self, parser: LayerTypeParser, sequence_length: int):
        """Initialize the cache, zero-filling KVs for the layers whose KVs are needed before they are computed."""
        layers_to_init = {parser[idx].attends_to for idx in range(len(parser)) if parser[idx].attends_top}

        if layers_to_init:
            b, h, _, d = self.placeholder.size()
            init_kvs = self.placeholder.new_zeros((b, h, sequence_length, d))

            for layer_idx in layers_to_init:
                self.layer_append(layer_idx, init_kvs, init_kvs)

    def layer_get(self, layer_idx: int, zerofill: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
        key_states = self.key_layer_cache.get(layer_idx, None)
        value_states = self.value_layer_cache.get(layer_idx, None)

        # When `zerofill` is set, missing KVs are substituted (or prefixed) with the placeholder
        if zerofill:
            if key_states is None:
                key_states = self.placeholder
                value_states = self.placeholder
            else:
                key_states = torch.cat([self.placeholder, key_states], dim=2)
                value_states = torch.cat([self.placeholder, value_states], dim=2)

        return key_states, value_states

    def layer_set(self, layer_idx: int, key: torch.Tensor, value: torch.Tensor):
        self.key_layer_cache[layer_idx] = key
        self.value_layer_cache[layer_idx] = value

    def layer_append(self, layer_idx: int, key: torch.Tensor, value: torch.Tensor):
        if layer_idx not in self.key_layer_cache:
            self.key_layer_cache[layer_idx] = key
            self.value_layer_cache[layer_idx] = value
        else:
            self.key_layer_cache[layer_idx] = torch.cat([self.key_layer_cache[layer_idx], key], dim=2)
            self.value_layer_cache[layer_idx] = torch.cat([self.value_layer_cache[layer_idx], value], dim=2)


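# A minimal sketch (illustrative only) of the layer-level cache: with `zerofill`,
# the placeholder is prepended so that a layer attending to a not-yet-computed
# top layer still receives tensors of the expected shape.
def _demo_layer_cache() -> None:
    cache = LayerCache()
    cache.setup(placeholder=torch.zeros(1, 2, 1, 4))
    key, _ = cache.layer_get(layer_idx=0, zerofill=True)
    assert key.shape == (1, 2, 1, 4)  # placeholder only
    cache.layer_append(0, torch.zeros(1, 2, 5, 4), torch.zeros(1, 2, 5, 4))
    key, _ = cache.layer_get(layer_idx=0, zerofill=True)
    assert key.shape == (1, 2, 6, 4)  # placeholder + cached KVs

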
class LayerIndexedCache(LayerCache, IndexedCache):
    """
    A cache for storing the key-value pairs for layers, combined with the abilities of the standard KV cache.
    """

    def __init__(self) -> None:
        LayerCache.__init__(self)
        IndexedCache.__init__(self)


class LayerIndexedSinkCache(LayerCache, IndexedSinkCache):
    """
    A cache for storing the key-value pairs for layers, combined with the abilities of the sink KV cache.
    """

    def __init__(self) -> None:
        LayerCache.__init__(self)
        IndexedSinkCache.__init__(self)


class LayerIndexedSlidingWindowCache(LayerCache, IndexedSlidingWindowCache):
    """
    A cache for storing the key-value pairs for layers, combined with the abilities of the sliding-window KV cache.
    """

    def __init__(self) -> None:
        LayerCache.__init__(self)
        IndexedSlidingWindowCache.__init__(self)


class LayerIndexedHybridCache(LayerCache, IndexedHybridCache):
    """
    A cache for storing the key-value pairs for layers, combined with the abilities of the hybrid KV cache.
    """

    def __init__(self) -> None:
        LayerCache.__init__(self)
        IndexedHybridCache.__init__(self)


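# A minimal sketch (illustrative only): the combined classes expose both the
# token-level `Cache` API (`update`) and the layer-level API (`layer_set`/`layer_get`)
# on a single object.
def _demo_layer_indexed_cache() -> None:
    cache = LayerIndexedCache()
    key = torch.zeros(1, 2, 3, 4)
    value = torch.zeros(1, 2, 3, 4)
    cache.update(key, value, layer_idx=0)  # token-level KV cache
    cache.layer_set(0, key, value)  # layer-level KV cache
    assert cache.get_seq_length(0) == 3
    assert cache.layer_get(0)[0].shape[-2] == 3

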
class AutoLayerCache(torch.nn.Module):
    """
    AutoLayerCache is a module that automatically creates a layer-aware cache from an existing cache.
    """

    CACHE_MAPPING = {
        DynamicCache: LayerIndexedCache,
        SinkCache: LayerIndexedSinkCache,
        IndexedSlidingWindowCache: LayerIndexedSlidingWindowCache,
        IndexedHybridCache: LayerIndexedHybridCache,
    }

    def __init__(self, *args, **kwargs):
        raise RuntimeError(
            f"{self.__class__.__name__} is designed to be instantiated "
            f"using the `{self.__class__.__name__}.from_cache(cache)` method."
        )

    @classmethod
    def from_cache(cls, cache: Cache, *args, **kwargs):
        """
        Create a new cache from an existing cache. The type of the new cache is looked up in `CACHE_MAPPING` based
        on the type of the original cache.
        """
        cache_type = type(cache)
        if cache_type not in cls.CACHE_MAPPING:
            raise ValueError(f"Cache type {cache_type} is not supported by {cls.__name__}.")

        cache_class = cls.CACHE_MAPPING[cache_type]

        if hasattr(cache_class, "from_cache"):
            return cache_class.from_cache(cache, *args, **kwargs)
        else:
            # Fall back to copying the attributes of the original cache into a fresh instance
            new_cache = cache_class(*args, **kwargs)
            new_cache.__dict__.update(cache.__dict__)
            return new_cache


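# A minimal sketch (illustrative only): converting a prefilled `DynamicCache`
# into its layer-aware counterpart via the mapping above.
def _demo_auto_layer_cache() -> None:
    dynamic_cache = DynamicCache()
    key = torch.zeros(1, 2, 3, 4)
    value = torch.zeros(1, 2, 3, 4)
    dynamic_cache.update(key, value, layer_idx=0)
    cache = AutoLayerCache.from_cache(dynamic_cache)
    assert isinstance(cache, LayerIndexedCache)
    assert cache.get_seq_length(0) == 3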