SLlama_v1_stochastic_t20 / checkpoint-48000 /modeling_sllama.py

Upload folder using huggingface_hub

8be4c74 verified 4 months ago

13.8 kB

	from typing import Callable, List, Optional, Tuple, Union
	import random

	import torch
	import torch.nn as nn
	from transformers.models.llama import LlamaForCausalLM, LlamaConfig
	from transformers.utils import logging
	from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

	logger = logging.get_logger(__name__)

	class SharperLlamaForCausalLM(LlamaForCausalLM):
	    """`LlamaForCausalLM` whose decoder layers use `SharperLlamaAttention` for self-attention."""

	    def __init__(self, config: LlamaConfig):
	        """Build the stock Llama model, then replace `self_attn` in every decoder layer.

	        Args:
	            config: Model configuration; it is forwarded unchanged to the parent
	                constructor and to each `SharperLlamaAttention`.
	        """
	        super().__init__(config)

	        # Lazy %-formatting: the (large) module repr is only rendered when INFO is enabled.
	        # This is logged before the swap below, so it shows the parent-built model.
	        logger.info("\n* SLlama: %s", self.model)

	        # The replacement modules are created after the parent constructor has returned.
	        for layer_i, layer in enumerate(self.model.layers):
	            layer.self_attn = SharperLlamaAttention(config, layer_idx=layer_i)

	class SharperLlamaAttention(nn.Module):
	    """Llama self-attention whose queries are divided by a per-head, per-position temperature.

	    The first two layers (``layer_idx < 2``) own the temperature as an ``nn.Parameter``
	    initialised to ``1 / config.temperature_div``; every other layer holds a buffer of
	    ones, so dividing by it leaves the queries (almost) unchanged.

	    Each forward pass refreshes a few diagnostics that outside code can read:
	    ``temperature_value`` (mean of the temperature slice used) and ``q_projs`` /
	    ``k_projs`` / ``v_projs`` (per-head L2 norms of the projection weights, as floats).
	    """

	    # Number of positions covered by the temperature table (its second dimension).
	    MAX_TEMPERATURE_POSITIONS = 2048
	    # Probability with which a forward pass divides the queries by the temperature.
	    TEMPERATURE_APPLY_PROB = 0.8

	    def __init__(self, config: LlamaConfig, layer_idx: int):
	        """Create the projections and the temperature table.

	        Args:
	            config: Llama configuration; must additionally provide ``temperature_div``
	                (read only when ``layer_idx < 2``).
	            layer_idx: Index of the decoder layer this module belongs to; also used as
	                the layer key when updating the KV cache.
	        """
	        super().__init__()
	        self.config = config
	        self.layer_idx = layer_idx
	        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
	        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
	        self.scaling = self.head_dim**-0.5
	        self.attention_dropout = config.attention_dropout
	        self.is_causal = True

	        self.q_proj = nn.Linear(
	            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
	        )
	        self.k_proj = nn.Linear(
	            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
	        )
	        self.v_proj = nn.Linear(
	            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
	        )
	        self.o_proj = nn.Linear(
	            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
	        )

	        # Table shape: (num_heads, MAX_TEMPERATURE_POSITIONS).
	        table = torch.ones(config.num_attention_heads, self.MAX_TEMPERATURE_POSITIONS)
	        if layer_idx < 2:
	            # Learnable temperature for the first two layers only.
	            self.temperature = nn.Parameter(table / config.temperature_div)
	            self.temperature_value = 1.0 / config.temperature_div
	            print(f"[params] Layer {layer_idx} temperature: {self.temperature_value}")
	        else:
	            # Fixed temperature of one; a buffer so it follows the module across devices.
	            self.register_buffer("temperature", table)
	            self.temperature_value = 1.0
	            print(f"[buffer] Layer {layer_idx} temperature: {self.temperature.shape}")

	        # Diagnostics, filled in by `forward`.
	        self.q_projs = None
	        self.k_projs = None
	        self.v_projs = None

	    def _position_temperature(
	        self, seq_len: int, cache_position: Optional[torch.LongTensor] = None
	    ) -> torch.Tensor:
	        """Return the temperature for the current query tokens, shaped (1, num_heads, seq_len, 1).

	        When ``cache_position`` holds one entry per query token it is used as the index
	        into the table, so incremental decoding reads the entry of the token's absolute
	        position rather than entry 0. Otherwise positions ``0..seq_len-1`` are used.
	        """
	        table = self.temperature
	        last_index = table.shape[-1] - 1
	        if cache_position is not None and cache_position.numel() == seq_len:
	            positions = cache_position.reshape(-1).to(device=table.device, dtype=torch.long)
	        else:
	            positions = torch.arange(seq_len, device=table.device)
	        # NOTE(review): positions past the end of the table reuse its last entry instead
	        # of failing on a shape/index error — confirm this is the desired extrapolation.
	        positions = positions.clamp(min=0, max=last_index)
	        return table[:, positions].unsqueeze(0).unsqueeze(-1)

	    @staticmethod
	    def _per_head_weight_norms(weight: torch.Tensor, num_heads: int) -> List[float]:
	        """Return the L2 norm of each head's block of rows of `weight` as Python floats."""
	        # One vectorised reduction and a single host transfer instead of one per head.
	        per_head = weight.detach().reshape(num_heads, -1)
	        return torch.linalg.vector_norm(per_head, ord=2, dim=1).tolist()

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
	        attention_mask: Optional[torch.Tensor],
	        past_key_value=None,
	        cache_position: Optional[torch.LongTensor] = None,
	        **kwargs,
	    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
	        """Run self-attention over `hidden_states`.

	        Args:
	            hidden_states: Input of shape (..., seq_len, hidden_size).
	            position_embeddings: ``(cos, sin)`` pair applied to queries and keys.
	            attention_mask: Mask handed to the attention implementation, or ``None``.
	            past_key_value: Optional cache exposing
	                ``update(key_states, value_states, layer_idx, cache_kwargs)``.
	            cache_position: Optional positions of the current tokens; forwarded to the
	                cache and used to select the matching temperature entries.
	            **kwargs: Forwarded to the attention implementation.

	        Returns:
	            ``(attn_output, attn_weights)`` as produced by the attention implementation,
	            with ``attn_output`` passed through ``o_proj``.
	        """
	        input_shape = hidden_states.shape[:-1]
	        hidden_shape = (*input_shape, -1, self.head_dim)

	        temperature = self._position_temperature(input_shape[-1], cache_position)

	        # (batch, num_heads, seq_len, head_dim) after the transpose.
	        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
	        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
	        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

	        cos, sin = position_embeddings
	        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

	        if past_key_value is not None:
	            # sin and cos are specific to RoPE models; cache_position needed for the static cache
	            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
	            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

	        attention_interface: Callable = eager_attention_forward
	        if self.config._attn_implementation != "eager":
	            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
	                logger.warning_once(
	                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
	                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
	                )
	            else:
	                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

	        # NOTE(review): the gate is sampled regardless of `self.training`, so inference is
	        # non-deterministic unless the `random` module is seeded — confirm this is intended.
	        if random.random() < self.TEMPERATURE_APPLY_PROB:
	            # Dividing the queries is equivalent to dividing the attention logits.
	            query_states = query_states / (temperature + 1e-6)

	        # Refresh the diagnostics read by outside code.
	        self.temperature_value = temperature.mean().item()
	        self.q_projs = self._per_head_weight_norms(self.q_proj.weight, self.config.num_attention_heads)
	        self.k_projs = self._per_head_weight_norms(self.k_proj.weight, self.config.num_key_value_heads)
	        self.v_projs = self._per_head_weight_norms(self.v_proj.weight, self.config.num_key_value_heads)

	        attn_output, attn_weights = attention_interface(
	            self,
	            query_states,
	            key_states,
	            value_states,
	            attention_mask,
	            dropout=0.0 if not self.training else self.attention_dropout,
	            scaling=self.scaling,
	            **kwargs,
	        )

	        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
	        attn_output = self.o_proj(attn_output)
	        return attn_output, attn_weights

	def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
	    """
	    Duplicate every key/value head `n_rep` times along the head axis, matching
	    torch.repeat_interleave(x, dim=1, repeats=n_rep). Input is (batch, num_key_value_heads,
	    seqlen, head_dim); output is (batch, num_key_value_heads * n_rep, seqlen, head_dim).
	    With `n_rep == 1` the input tensor itself is returned.
	    """
	    bsz, kv_heads, seq, dim = hidden_states.shape
	    if n_rep != 1:
	        # Insert a repeat axis after the head axis, broadcast it, then fold it into the heads.
	        tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq, dim)
	        return tiled.reshape(bsz, kv_heads * n_rep, seq, dim)
	    return hidden_states


	def eager_attention_forward(
	    module: nn.Module,
	    query: torch.Tensor,
	    key: torch.Tensor,
	    value: torch.Tensor,
	    attention_mask: Optional[torch.Tensor],
	    scaling: float,
	    dropout: float = 0.0,
	    **kwargs,
	):
	    """Plain matmul + softmax attention.

	    Keys and values are expanded by `module.num_key_value_groups` to match the query
	    heads. Logits are `query @ key^T * scaling / temperature`, where `temperature`
	    is read from `kwargs` (default 1.0). An optional additive mask, cropped to the key
	    length, is applied before a float32 softmax.

	    Returns:
	        `(attn_output, attn_weights)`; `attn_output` has its head and sequence axes
	        swapped (`transpose(1, 2)`) and is contiguous.
	    """
	    temperature = kwargs.get("temperature", 1.0)
	    groups = module.num_key_value_groups
	    keys = repeat_kv(key, groups)
	    values = repeat_kv(value, groups)

	    scores = torch.matmul(query, keys.transpose(2, 3)) * scaling / temperature
	    if attention_mask is not None:
	        # Crop the mask's last axis to the number of key positions.
	        scores = scores + attention_mask[:, :, :, : keys.shape[-2]]

	    # Softmax in float32, then cast back to the query dtype.
	    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
	    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
	    context = torch.matmul(probs, values).transpose(1, 2).contiguous()

	    return context, probs

	class LlamaRMSNorm(nn.Module):
	    """Root-mean-square normalisation over the last axis with a learned per-feature gain.

	    LlamaRMSNorm is equivalent to T5LayerNorm.
	    """

	    def __init__(self, hidden_size, eps=1e-6):
	        super().__init__()
	        self.variance_epsilon = eps
	        self.weight = nn.Parameter(torch.ones(hidden_size))

	    def forward(self, hidden_states):
	        """Normalise in float32, cast back to the input dtype, then scale by `weight`."""
	        original_dtype = hidden_states.dtype
	        as_float = hidden_states.to(torch.float32)
	        mean_square = as_float.pow(2).mean(-1, keepdim=True)
	        normalised = as_float * torch.rsqrt(mean_square + self.variance_epsilon)
	        return self.weight * normalised.to(original_dtype)

	    def extra_repr(self):
	        shape = tuple(self.weight.shape)
	        return f"{shape}, eps={self.variance_epsilon}"



	class LlamaRotaryEmbedding(nn.Module):
	    """Produces the rotary position embedding tables (`cos`, `sin`) for given position ids."""

	    def __init__(self, config: LlamaConfig, device=None):
	        """Select the RoPE variant from `config.rope_scaling` and build `inv_freq`.

	        Args:
	            config: Model configuration; `rope_scaling` (optional) and
	                `max_position_embeddings` are read here, and the whole object is passed
	                to the RoPE init function.
	            device: Device forwarded to the RoPE init function.
	        """
	        super().__init__()
	        # This module's header imports do not provide ROPE_INIT_FUNCTIONS, so it is
	        # imported here; without it the lookup below raises NameError.
	        from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS

	        # BC: "rope_type" was originally "type"
	        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
	            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
	        else:
	            self.rope_type = "default"
	        self.max_seq_len_cached = config.max_position_embeddings
	        self.original_max_seq_len = config.max_position_embeddings

	        self.config = config
	        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

	        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
	        # Non-persistent: the buffer is not written to the state dict.
	        self.register_buffer("inv_freq", inv_freq, persistent=False)
	        # Kept so the "reset" branch of `_dynamic_frequency_update` can restore it.
	        self.original_inv_freq = self.inv_freq

	    def _dynamic_frequency_update(self, position_ids, device):
	        """
	        dynamic RoPE layers should recompute `inv_freq` in the following situations:
	        1 - growing beyond the cached sequence length (allow scaling)
	        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
	        """
	        seq_len = torch.max(position_ids) + 1
	        if seq_len > self.max_seq_len_cached:  # growth
	            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
	            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
	            self.max_seq_len_cached = seq_len

	        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
	            # This .to() is needed if the model has been moved to a device after being initialized (because
	            # the buffer is automatically moved, but not the original copy)
	            self.original_inv_freq = self.original_inv_freq.to(device)
	            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
	            self.max_seq_len_cached = self.original_max_seq_len

	    @torch.no_grad()
	    def forward(self, x, position_ids):
	        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`.

	        `x` is only consulted for its device and dtype. `position_ids` is indexed as a
	        2-D tensor (first axis is expanded against `inv_freq`).
	        """
	        if "dynamic" in self.rope_type:
	            self._dynamic_frequency_update(position_ids, device=x.device)

	        # Core RoPE block
	        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
	        position_ids_expanded = position_ids[:, None, :].float()
	        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
	        device_type = x.device.type
	        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
	        with torch.autocast(device_type=device_type, enabled=False):
	            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
	            # Frequencies are duplicated so the last axis is twice the length of inv_freq.
	            emb = torch.cat((freqs, freqs), dim=-1)
	            cos = emb.cos()
	            sin = emb.sin()

	        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
	        cos = cos * self.attention_scaling
	        sin = sin * self.attention_scaling

	        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


	def rotate_half(x):
	    """Swap the two halves of the last axis and negate the half that moves to the front."""
	    width = x.shape[-1]
	    half = width // 2
	    # Explicit sizes so an odd width splits as (half, width - half), like slicing at `half`.
	    front, back = torch.split(x, [half, width - half], dim=-1)
	    return torch.cat((-back, front), dim=-1)


	def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
	    """Rotate the query and key tensors with the given rotary position embedding.

	    Args:
	        q (`torch.Tensor`): The query tensor.
	        k (`torch.Tensor`): The key tensor.
	        cos (`torch.Tensor`): The cosine part of the rotary embedding.
	        sin (`torch.Tensor`): The sine part of the rotary embedding.
	        position_ids (`torch.Tensor`, optional):
	            Deprecated and unused.
	        unsqueeze_dim (`int`, optional, defaults to 1):
	            Axis inserted into `cos` and `sin` so they broadcast against `q` and `k`.
	            With `cos`/`sin` of shape [batch_size, seq_len, head_dim], use 1 when `q` and
	            `k` are [batch_size, heads, seq_len, head_dim] and 2 when they are
	            [batch_size, seq_len, heads, head_dim].
	    Returns:
	        `tuple(torch.Tensor)`: the rotated query and key tensors, in that order.
	    """
	    cos_b = cos.unsqueeze(unsqueeze_dim)
	    sin_b = sin.unsqueeze(unsqueeze_dim)

	    def _rotate(states):
	        # Same formula for queries and keys.
	        return (states * cos_b) + (rotate_half(states) * sin_b)

	    return _rotate(q), _rotate(k)


	class LlamaMLP(nn.Module):
	    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`."""

	    def __init__(self, config):
	        """Create the three projections and look up the activation.

	        Args:
	            config: Provides `hidden_size`, `intermediate_size`, `mlp_bias` and
	                `hidden_act` (the key used to select the activation).
	        """
	        super().__init__()
	        # This module's header imports do not provide ACT2FN, so it is imported here;
	        # without it the lookup below raises NameError.
	        from transformers.activations import ACT2FN

	        self.config = config
	        self.hidden_size = config.hidden_size
	        self.intermediate_size = config.intermediate_size
	        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
	        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
	        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
	        self.act_fn = ACT2FN[config.hidden_act]

	    def forward(self, x):
	        """Apply the gated projection to `x` and map the result back with `down_proj`."""
	        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
	        return self.down_proj(gated)