# tinyllama-lckv-w10-ft-250b / cache_utils.py
from typing import Any, Dict, List, Optional, Tuple
import torch
from transformers.cache_utils import Cache, DynamicCache, SinkCache
from .utils import LayerTypeParser
class IndexedCache(Cache):
"""
Similar to the `DynamicCache` class, but with the ability to index the cache by layer index. DynamicCache
assumes that all layers compute KVs, while IndexedCache allows for a more flexible cache structure.
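    Example (a minimal illustrative sketch; the layer index and tensor shapes below are hypothetical):

        ```python
        >>> import torch
        >>> cache = IndexedCache()
        >>> # (batch_size, num_key_value_heads, seq_len, head_dim)
        >>> key = torch.randn(1, 4, 3, 8)
        >>> value = torch.randn(1, 4, 3, 8)
        >>> # only layer 2 computes KVs here; other layers simply never appear in the cache
        >>> k, v = cache.update(key, value, layer_idx=2)
        >>> len(cache), cache.min_layer
        (1, 2)
        >>> cache.get_seq_length()
        3
        ```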
"""
build_position_ids_based_on_cache = False
def __init__(self) -> None:
super().__init__()
self.key_cache: Dict[int, torch.Tensor] = {}
self.value_cache: Dict[int, torch.Tensor] = {}
        self._seen_tokens = 0  # Used in `generate` to keep a tally of how many tokens the cache has seen
        self._update = True  # Set to False to prevent the cache from updating during iterative inference
    def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
sequence length.
"""
if layer_idx in self.key_cache:
return (self.key_cache[layer_idx], self.value_cache[layer_idx])
else:
raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
def __iter__(self):
"""
Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
keys and values
"""
for layer_idx in sorted(self.key_cache.keys()):
yield (self.key_cache[layer_idx], self.value_cache[layer_idx])
def __len__(self):
"""
Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
to the number of layers that compute KVs in the model.
"""
return len(self.key_cache)
@property
    def min_layer(self) -> Optional[int]:
return min(self.key_cache.keys()) if len(self.key_cache) > 0 else None
def is_min_layer(self, layer_idx: int) -> bool:
return self.min_layer is None or self.min_layer == layer_idx
def update(
self,
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
Parameters:
key_states (`torch.Tensor`):
The new key states to cache.
value_states (`torch.Tensor`):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `IndexedCache`.
Return:
A tuple containing the updated key and value states.
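        Example (a minimal illustrative sketch of a prefill step followed by one decode step; the shapes are
        hypothetical):

            ```python
            >>> import torch
            >>> cache = IndexedCache()
            >>> k, v = cache.update(torch.randn(1, 4, 5, 8), torch.randn(1, 4, 5, 8), layer_idx=0)
            >>> k.shape[-2]  # prefill: 5 positions cached for layer 0
            5
            >>> k, v = cache.update(torch.randn(1, 4, 1, 8), torch.randn(1, 4, 1, 8), layer_idx=0)
            >>> k.shape[-2]  # decode: the new token is concatenated along the sequence dimension
            6
            ```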
"""
# Update the number of seen tokens
if self.is_min_layer(layer_idx):
self._seen_tokens += key_states.shape[-2]
# Retrieve the cache
if layer_idx not in self.key_cache:
new_key_states = key_states
new_value_states = value_states
else:
new_key_states = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
new_value_states = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
# Update the cache
if self._update:
self.key_cache[layer_idx] = new_key_states
self.value_cache[layer_idx] = new_value_states
return new_key_states, new_value_states
def get_seq_length(self, layer_idx: Optional[int] = None) -> int:
"""Returns the sequence length of the cached states. A layer index can be optionally passed."""
if layer_idx is None:
layer_idx = self.min_layer
# TODO: deprecate this function in favor of `cache_position`
is_empty_layer = (
(len(self.key_cache) == 0) # no cache in any layer
or (layer_idx not in self.key_cache) # skipped `layer_idx` and hasn't run a layer with cache after it
)
layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0
return layer_seq_length
def get_max_length(self) -> Optional[int]:
"""Returns the maximum sequence length of the cached states. IndexedCache does not have a maximum length."""
return None
@classmethod
def from_cache(cls, dynamic_cache: DynamicCache, *args, **kwargs) -> "IndexedCache":
"""Converts a dynamic cache into an equivalent `IndexedCache`."""
cache = cls(*args, **kwargs)
cache._seen_tokens = dynamic_cache._seen_tokens
for layer_idx in range(len(dynamic_cache.key_cache)):
key_states, value_states = dynamic_cache[layer_idx]
cache.update(key_states, value_states, layer_idx)
return cache
class IndexedSinkCache(Cache):
"""
This is a fix to the SinkCache class in the transformers library. It also allows for the cache to be indexed by
layer index, similar to the `IndexedCache` class.
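    Example (a minimal illustrative sketch without RoPE, so `cache_kwargs` carries no `sin`/`cos`; the window
    sizes and shapes below are hypothetical):

        ```python
        >>> import torch
        >>> cache = IndexedSinkCache(window_length=6, num_sink_tokens=2)
        >>> key = torch.randn(1, 4, 8, 16)
        >>> value = torch.randn(1, 4, 8, 16)
        >>> k, v = cache.update(key, value, layer_idx=0, cache_kwargs={})
        >>> k.shape[-2]              # the returned states still contain all 8 positions
        8
        >>> cache.get_seq_length(0)  # the stored cache keeps 2 sink tokens + the last 4 tokens
        6
        ```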
"""
build_position_ids_based_on_cache = True
    def __init__(self, window_length: Optional[int] = None, num_sink_tokens: Optional[int] = None) -> None:
super().__init__()
self.key_cache: Dict[int, torch.Tensor] = {}
self.value_cache: Dict[int, torch.Tensor] = {}
self.window_length = window_length
self.num_sink_tokens = num_sink_tokens
self.cos_sin_rerotation_cache = {}
self._cos_cache = None
self._sin_cache = None
        self._seen_tokens = 0  # Used in `generate` to keep a tally of how many tokens the cache has seen
        self._update = True  # Set to False to prevent the cache from updating during iterative inference
@staticmethod
def _rotate_half(x):
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def _apply_key_rotary_pos_emb(
self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
) -> torch.Tensor:
rotated_key_states = (key_states * cos) + (self._rotate_half(key_states) * sin)
return rotated_key_states
def _get_rerotation_cos_sin(
self, offset: int, dtype: torch.dtype, cos: torch.Tensor, sin: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
if offset not in self.cos_sin_rerotation_cache:
# Upcast to float32 temporarily for better accuracy
cos = cos.to(torch.float32)
sin = sin.to(torch.float32)
# Compute the cos and sin required for back- and forward-rotating to one position earlier in the sequence
original_cos = cos[self.num_sink_tokens + offset :]
shifted_cos = cos[self.num_sink_tokens : -offset]
original_sin = sin[self.num_sink_tokens + offset :]
shifted_sin = sin[self.num_sink_tokens : -offset]
rerotation_cos = original_cos * shifted_cos + original_sin * shifted_sin
rerotation_sin = -original_sin * shifted_cos + original_cos * shifted_sin
self.cos_sin_rerotation_cache[offset] = (
rerotation_cos.to(dtype).unsqueeze(0),
rerotation_sin.to(dtype).unsqueeze(0),
)
return self.cos_sin_rerotation_cache[offset]
@property
    def min_layer(self) -> Optional[int]:
return min(self.key_cache.keys()) if len(self.key_cache) > 0 else None
def is_min_layer(self, layer_idx: int) -> bool:
return self.min_layer is None or self.min_layer == layer_idx
def get_seq_length(self, layer_idx: Optional[int] = None) -> int:
"""Returns the sequence length of the cached states. A layer index can be optionally passed."""
# TODO: deprecate this function in favor of `cache_position`
# Workaround to make 'key_states.shape[-2] + past_key_value.get_seq_length(self.layer_idx)' <= window_length
if layer_idx is None:
layer_idx = self.min_layer
if layer_idx not in self.key_cache:
return 0
return self.key_cache[layer_idx].shape[-2]
def get_max_length(self) -> Optional[int]:
"""Returns the maximum sequence length of the cached states."""
return self.window_length
def update(
self,
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
Parameters:
key_states (`torch.Tensor`):
The new key states to cache.
value_states (`torch.Tensor`):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. The following arguments can be used in `IndexedSinkCache`: `sin`,
`cos` and `partial_rotation_size`. These arguments are used with models using RoPE, to recompute the
rotation as the tokens are shifted.
Return:
A tuple containing the updated key and value states.
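        Example (a minimal illustrative sketch of the RoPE path; the all-ones `cos` and all-zeros `sin` below
        are hypothetical stand-ins that make the re-rotation an identity):

            ```python
            >>> import torch
            >>> cache = IndexedSinkCache(window_length=6, num_sink_tokens=2)
            >>> seq_len, head_dim = 8, 16
            >>> kwargs = {"cos": torch.ones(seq_len, head_dim), "sin": torch.zeros(seq_len, head_dim)}
            >>> key = torch.randn(1, 4, seq_len, head_dim)
            >>> value = torch.randn(1, 4, seq_len, head_dim)
            >>> k, v = cache.update(key, value, layer_idx=0, cache_kwargs=kwargs)
            >>> cache.get_seq_length(0)  # 2 sink tokens + the last (window_length - num_sink_tokens) tokens
            6
            >>> torch.equal(cache.key_cache[0][:, :, :2], key[:, :, :2])  # sink tokens are kept verbatim
            True
            ```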
"""
# Optional kwargs for `SinkCache` -- needed on models using RoPE. `partial_rotation_size` is used on models
# with partially rotated position embeddings, like Phi or Persimmon.
sin = cache_kwargs.get("sin")
cos = cache_kwargs.get("cos")
partial_rotation_size = cache_kwargs.get("partial_rotation_size")
using_rope = cos is not None and sin is not None
# Update the number of seen tokens
if self.is_min_layer(layer_idx):
self._seen_tokens += key_states.shape[-2]
# Update the sin/cos cache, which holds sin/cos values for all possible positions
if using_rope and self.is_min_layer(layer_idx):
# BC: some models still pass `sin`/`cos` with 2 dims. In those models, they are the full sin/cos. Remove
# after all RoPE models have a llama-like cache utilization.
if cos.dim() == 2:
self._cos_cache = cos
self._sin_cache = sin
else:
if self._cos_cache is None:
self._cos_cache = cos[0, ...]
self._sin_cache = sin[0, ...]
elif self._cos_cache.shape[0] < self.window_length + key_states.shape[-2]:
self._cos_cache = torch.cat([self._cos_cache[: self.window_length], cos[0, ...]], dim=0)
self._sin_cache = torch.cat([self._sin_cache[: self.window_length], sin[0, ...]], dim=0)
# [bsz, num_heads, seq_len, head_dim]
if layer_idx not in self.key_cache:
# Empty cache
new_key_states = key_states
new_value_states = value_states
else:
# Growing cache
new_key_states = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
new_value_states = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
if self._update:
self.key_cache[layer_idx] = new_key_states
self.value_cache[layer_idx] = new_value_states
# If the cache is full, we need to shift the cache
if (seq_length := self.get_seq_length(layer_idx)) > self.window_length:
# Shifting cache
keys_to_keep = self.key_cache[layer_idx][:, :, -self.window_length + self.num_sink_tokens :]
# On RoPE models, we need to recompute the Key rotation as the tokens are shifted
if using_rope:
rerotation_cos, rerotation_sin = self._get_rerotation_cos_sin(
seq_length - self.window_length,
key_states.dtype,
self._cos_cache[:seq_length],
self._sin_cache[:seq_length],
)
if partial_rotation_size is not None:
keys_to_keep, keys_pass = (
keys_to_keep[..., :partial_rotation_size],
keys_to_keep[..., partial_rotation_size:],
)
keys_to_keep = self._apply_key_rotary_pos_emb(keys_to_keep, rerotation_cos, rerotation_sin)
if partial_rotation_size is not None:
keys_to_keep = torch.cat((keys_to_keep, keys_pass), dim=-1)
# Concatenate sink tokens, shifted & rotated tokens (if needed), and new tokens
sink_keys = self.key_cache[layer_idx][:, :, : self.num_sink_tokens]
self.key_cache[layer_idx] = torch.cat([sink_keys, keys_to_keep], dim=-2)
sink_values = self.value_cache[layer_idx][:, :, : self.num_sink_tokens]
values_to_keep = self.value_cache[layer_idx][:, :, -self.window_length + self.num_sink_tokens :]
self.value_cache[layer_idx] = torch.cat([sink_values, values_to_keep], dim=-2)
return new_key_states, new_value_states
@classmethod
def from_cache(cls, sink_cache: SinkCache, *args, **kwargs) -> "IndexedSinkCache":
"""Converts a dynamic cache into an equivalent `IndexedCache`."""
cache = cls(*args, **kwargs)
cache.window_length = sink_cache.window_length
cache.num_sink_tokens = sink_cache.num_sink_tokens
cache._seen_tokens = sink_cache._seen_tokens
cache._cos_cache = sink_cache._cos_cache
cache._sin_cache = sink_cache._sin_cache
cache.cos_sin_rerotation_cache = sink_cache.cos_sin_rerotation_cache
for layer_idx in range(len(sink_cache.key_cache)):
cache.key_cache[layer_idx] = sink_cache.key_cache[layer_idx]
cache.value_cache[layer_idx] = sink_cache.value_cache[layer_idx]
return cache
class IndexedSlidingWindowCache(IndexedCache):
"""
Similar to the `SlidingWindowCache` class, but with the ability to index the cache by layer index. It is no longer
a subclass of `StaticCache` as it is dynamic.
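    Example (a minimal illustrative sketch; the window size and shapes below are hypothetical):

        ```python
        >>> import torch
        >>> cache = IndexedSlidingWindowCache(sliding_window=4)
        >>> key = torch.randn(1, 4, 6, 16)
        >>> value = torch.randn(1, 4, 6, 16)
        >>> k, v = cache.update(key, value, layer_idx=0)
        >>> k.shape[-2]              # the returned states keep all 6 positions
        6
        >>> cache.get_seq_length(0)  # the stored cache is clipped to the sliding window
        4
        ```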
"""
build_position_ids_based_on_cache = False
    def __init__(self, sliding_window: Optional[int] = None) -> None:
super().__init__()
self.sliding_window = sliding_window
def update(
self,
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
# Update the number of seen tokens
if self.is_min_layer(layer_idx):
self._seen_tokens += key_states.shape[-2]
# [bsz, num_heads, seq_len, head_dim]
if layer_idx not in self.key_cache:
# Empty cache
new_key_states = key_states
new_value_states = value_states
else:
# Growing cache
new_key_states = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
new_value_states = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
if self._update:
self.key_cache[layer_idx] = new_key_states
self.value_cache[layer_idx] = new_value_states
# If the cache is full, we need to shift the cache
if self.get_seq_length(layer_idx) > self.sliding_window:
self.key_cache[layer_idx] = self.key_cache[layer_idx][:, :, -self.sliding_window :]
self.value_cache[layer_idx] = self.value_cache[layer_idx][:, :, -self.sliding_window :]
return new_key_states, new_value_states
def get_max_length(self) -> Optional[int]:
return self.sliding_window
@classmethod
def from_cache(cls, sliding_window_cache: "IndexedSlidingWindowCache", *args, **kwargs) -> "IndexedSlidingWindowCache":
"""This is to override the `from_cache` method in the `IndexedCache` class."""
cache = cls(*args, **kwargs)
cache._seen_tokens = sliding_window_cache._seen_tokens
cache.sliding_window = sliding_window_cache.sliding_window
for layer_idx in range(len(sliding_window_cache.key_cache)):
cache.key_cache[layer_idx] = sliding_window_cache.key_cache[layer_idx]
cache.value_cache[layer_idx] = sliding_window_cache.value_cache[layer_idx]
return cache
class IndexedHybridCache(IndexedSlidingWindowCache, IndexedCache):
"""
Hybrid Cache class to be used for models that alternate between a local sliding window attention and global
    attention in every other layer. Under the hood, the hybrid cache leverages `IndexedSlidingWindowCache` for
    sliding window attention and `IndexedCache` for global attention.
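    Example (a minimal illustrative sketch of the per-layer dispatch; the list of `SimpleNamespace` objects
    below is a hypothetical stand-in for `LayerTypeParser`):

        ```python
        >>> import torch
        >>> from types import SimpleNamespace
        >>> # layer 0 uses sliding-window attention, layer 1 uses global attention
        >>> parser = [SimpleNamespace(use_sliding_window=True), SimpleNamespace(use_sliding_window=False)]
        >>> cache = IndexedHybridCache(parser=parser, sliding_window=4)
        >>> key = torch.randn(1, 4, 6, 16)
        >>> value = torch.randn(1, 4, 6, 16)
        >>> _ = cache.update(key, value, layer_idx=0)  # clipped to the sliding window
        >>> _ = cache.update(key, value, layer_idx=1)  # kept in full
        >>> cache.get_seq_length(0), cache.get_seq_length(1)
        (4, 6)
        ```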
"""
build_position_ids_based_on_cache = False
    def __init__(self, parser: Optional[LayerTypeParser] = None, sliding_window: Optional[int] = None) -> None:
super().__init__(sliding_window=sliding_window)
self.parser = parser
def update(
self,
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
if self.parser[layer_idx].use_sliding_window:
return IndexedSlidingWindowCache.update(self, key_states, value_states, layer_idx, cache_kwargs)
else:
return IndexedCache.update(self, key_states, value_states, layer_idx, cache_kwargs)
def get_max_length(self) -> Optional[int]:
return IndexedCache.get_max_length(self)
@classmethod
def from_cache(cls, hybrid_cache: "IndexedHybridCache", *args, **kwargs) -> "IndexedHybridCache":
"""This is to override the `from_cache` method in the `IndexedSlidingWindowCache` class."""
cache = cls(*args, **kwargs)
cache._seen_tokens = hybrid_cache._seen_tokens
cache.sliding_window = hybrid_cache.sliding_window
cache.parser = hybrid_cache.parser
for layer_idx in range(len(hybrid_cache.key_cache)):
cache.key_cache[layer_idx] = hybrid_cache.key_cache[layer_idx]
cache.value_cache[layer_idx] = hybrid_cache.value_cache[layer_idx]
return cache
class LayerCache(torch.nn.Module):
"""
A cache for storing the key-value pairs for layers.
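    Example (a minimal illustrative sketch; the shapes below are hypothetical):

        ```python
        >>> import torch
        >>> cache = LayerCache()
        >>> # placeholder shape: (batch_size, num_key_value_heads, 1, head_dim)
        >>> cache.setup(torch.zeros(1, 4, 1, 16))
        >>> cache.layer_set(0, torch.randn(1, 4, 5, 16), torch.randn(1, 4, 5, 16))
        >>> k, v = cache.layer_get(0, zerofill=True)
        >>> k.shape[-2]  # the placeholder is prepended when `zerofill=True`
        6
        >>> k, v = cache.layer_get(1, zerofill=True)
        >>> k.shape[-2]  # an absent layer falls back to the placeholder alone
        1
        ```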
"""
def __init__(self) -> None:
"""
The placeholder is used to expand the key-value pairs if the layer attends to the top layers.
Size: (batch_size, num_key_value_heads, 1, head_dim)
"""
super().__init__()
self.key_layer_cache: Dict[int, torch.Tensor] = {}
self.value_layer_cache: Dict[int, torch.Tensor] = {}
self.layer_type = None
self.placeholder = None
def setup(self, placeholder: torch.Tensor):
"""setup the cache, calling this function is necessary if there is a layer that attends to the top layers"""
self.placeholder = placeholder
def initialize(self, parser: LayerTypeParser, sequence_length: int):
"""initialize the cache"""
layers_to_init = {parser[idx].attends_to for idx in range(len(parser)) if parser[idx].attends_top}
if layers_to_init:
b, h, _, d = self.placeholder.size()
init_kvs = self.placeholder.new_zeros((b, h, sequence_length, d))
for layer_idx in layers_to_init:
self.layer_append(layer_idx, init_kvs, init_kvs)
def layer_get(self, layer_idx: int, zerofill: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
key_states = self.key_layer_cache.get(layer_idx, None)
value_states = self.value_layer_cache.get(layer_idx, None)
if zerofill:
if key_states is None:
key_states = self.placeholder
value_states = self.placeholder
else:
key_states = torch.cat([self.placeholder, key_states], dim=2)
value_states = torch.cat([self.placeholder, value_states], dim=2)
return key_states, value_states
def layer_set(self, layer_idx: int, key: torch.Tensor, value: torch.Tensor):
self.key_layer_cache[layer_idx] = key
self.value_layer_cache[layer_idx] = value
def layer_append(self, layer_idx: int, key: torch.Tensor, value: torch.Tensor):
if layer_idx not in self.key_layer_cache:
self.key_layer_cache[layer_idx] = key
self.value_layer_cache[layer_idx] = value
else:
self.key_layer_cache[layer_idx] = torch.cat([self.key_layer_cache[layer_idx], key], dim=2)
self.value_layer_cache[layer_idx] = torch.cat([self.value_layer_cache[layer_idx], value], dim=2)
class LayerIndexedCache(LayerCache, IndexedCache):
"""
A cache for storing the key-value pairs for layers, in combination with the ability of standard KV cache.
"""
def __init__(self) -> None:
LayerCache.__init__(self)
IndexedCache.__init__(self)
class LayerIndexedSinkCache(LayerCache, IndexedSinkCache):
"""
A cache for storing the key-value pairs for layers, in combination with the ability of sink KV cache.
"""
def __init__(self) -> None:
LayerCache.__init__(self)
IndexedSinkCache.__init__(self)
class LayerIndexedSlidingWindowCache(LayerCache, IndexedSlidingWindowCache):
"""
A cache for storing the key-value pairs for layers, in combination with the ability of sliding window KV cache.
"""
def __init__(self) -> None:
LayerCache.__init__(self)
IndexedSlidingWindowCache.__init__(self)
class LayerIndexedHybridCache(LayerCache, IndexedHybridCache):
"""
A cache for storing the key-value pairs for layers, in combination with the ability of hybrid KV cache.
"""
def __init__(self) -> None:
LayerCache.__init__(self)
IndexedHybridCache.__init__(self)
class AutoLayerCache(torch.nn.Module):
"""
    AutoLayerCache automatically creates the matching layer-aware cache from an existing cache instance.
    It is not meant to be instantiated directly; use the `AutoLayerCache.from_cache(cache)` classmethod instead.
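    Example (a minimal illustrative sketch converting a `DynamicCache`; the shapes below are hypothetical):

        ```python
        >>> import torch
        >>> from transformers.cache_utils import DynamicCache
        >>> dynamic_cache = DynamicCache()
        >>> key = torch.randn(1, 4, 3, 16)
        >>> value = torch.randn(1, 4, 3, 16)
        >>> _ = dynamic_cache.update(key, value, layer_idx=0)
        >>> cache = AutoLayerCache.from_cache(dynamic_cache)
        >>> type(cache).__name__
        'LayerIndexedCache'
        >>> cache.get_seq_length()
        3
        ```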
"""
CACHE_MAPPING = {
DynamicCache: LayerIndexedCache,
SinkCache: LayerIndexedSinkCache,
IndexedSlidingWindowCache: LayerIndexedSlidingWindowCache,
IndexedHybridCache: LayerIndexedHybridCache,
}
def __init__(self, *args, **kwargs):
raise RuntimeError(
f"{self.__class__.__name__} is designed to be instantiated "
f"using the `{self.__class__.__name__}.from_cache(cache)` method."
)
@classmethod
def from_cache(cls, cache: Cache, *args, **kwargs):
"""
        Create a new cache from an existing cache. The new cache type is the layer-aware counterpart of the
        original cache type, as defined in `CACHE_MAPPING`.
"""
cache_type = type(cache)
if cache_type not in cls.CACHE_MAPPING:
raise ValueError(f"Cache type {cache_type} is not supported by {cls.__name__}.")
cache_class = cls.CACHE_MAPPING[cache_type]
if hasattr(cache_class, "from_cache"):
return cache_class.from_cache(cache, *args, **kwargs)
else:
# we init an empty cache and copy the attributes
new_cache = cache_class(*args, **kwargs)
new_cache.__dict__.update(cache.__dict__)
return new_cache