# Copyright (c) OpenMMLab. All rights reserved.
import inspect
import warnings
from typing import Optional, Tuple

import torch
import torch.distributed as dist
from mmengine import MessageHub

from xtuner.parallel.sequence import (get_sequence_parallel_world_size,
                                      post_process_for_sequence_parallel_attn,
                                      pre_process_for_sequence_parallel_attn)

from .attention import flash_attn_wo_mask, varlen_flash_attn

try:
    from transformers.cache_utils import Cache
except ImportError:
    # Fallback for older transformers versions that do not expose `Cache`.
    class Cache:
        pass

_flash_supports_window_size = False
try:
    from flash_attn import flash_attn_func

    # Sliding-window attention needs a flash-attn build whose
    # `flash_attn_func` accepts `window_size`; fail early if the installed
    # version is too old. A missing flash-attn install is tolerated here and
    # reported later, when the patched forward is actually called.
    _flash_supports_window_size = 'window_size' in list(
        inspect.signature(flash_attn_func).parameters)
    if not _flash_supports_window_size:
        raise ValueError(
            'Please update flash-attention to support window size.')
except ImportError:
    pass
# Copied from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/3a811845d89f3c1b3f41b341d0f9f05104769f35/modeling_phi3.py#L302 # noqa:E501
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""This is the equivalent of torch.repeat_interleave(x, dim=1,
repeats=n_rep).
The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to
(batch, num_attention_heads, seqlen, head_dim)
"""
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :,
None, :, :].expand(batch,
num_key_value_heads,
n_rep, slen, head_dim)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen,
head_dim)
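
# Illustrative shape check (not executed here; assumes 2 KV heads repeated 4x):
#   x = torch.randn(1, 2, 16, 64)        # (batch, num_kv_heads, seqlen, dim)
#   y = repeat_kv(x, 4)                  # (1, 8, 16, 64)
#   assert torch.equal(y, torch.repeat_interleave(x, repeats=4, dim=1))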
# https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/3a811845d89f3c1b3f41b341d0f9f05104769f35/modeling_phi3.py#L247 # noqa:E501
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., :x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=-1)
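
# Example: for a last dim of [x1, x2, x3, x4], rotate_half returns
# [-x3, -x4, x1, x2], i.e. the second half is negated and moved in front.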
# Copied from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/3a811845d89f3c1b3f41b341d0f9f05104769f35/modeling_phi3.py#L255 # noqa:E501
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors.
Args:
q (`torch.Tensor`): The query tensor.
k (`torch.Tensor`): The key tensor.
cos (`torch.Tensor`): The cosine part of the rotary embedding.
sin (`torch.Tensor`): The sine part of the rotary embedding.
position_ids (`torch.Tensor`, *optional*):
Deprecated and unused.
unsqueeze_dim (`int`, *optional*, defaults to 1):
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
""" # noqa:E501
cos = cos.unsqueeze(unsqueeze_dim)
sin = sin.unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
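
# Shape sketch (illustrative, assuming the usual layout in this file): with
# q and k of shape (batch, num_heads, seq_len, head_dim) and cos/sin of shape
# (batch, seq_len, head_dim), unsqueeze_dim=1 inserts the head axis so cos/sin
# broadcast as (batch, 1, seq_len, head_dim) against q and k.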
def phi3_attn_forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
):
if not _flash_supports_window_size:
raise ValueError(
'The current flash attention version does not support '
'sliding window attention.')
    # Flash attention does not return attention weights.
    output_attentions = False
if 'padding_mask' in kwargs:
        warnings.warn(
            'Passing `padding_mask` is deprecated and will be removed in '
            'v4.37. Please make sure to use `attention_mask` instead.')
# overwrite attention_mask with padding_mask
attention_mask = kwargs.pop('padding_mask')
bsz, q_len, _ = hidden_states.size()
qkv = self.qkv_proj(hidden_states)
query_pos = self.num_heads * self.head_dim
query_states = qkv[..., :query_pos]
key_states = qkv[..., query_pos:query_pos +
self.num_key_value_heads * self.head_dim]
value_states = qkv[...,
query_pos + self.num_key_value_heads * self.head_dim:]
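    # The fused qkv_proj output is laid out as [num_heads * head_dim |
    # num_kv_heads * head_dim | num_kv_heads * head_dim], so the three slices
    # above recover Q, K and V respectively.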
    # Flash attention requires the input to have the shape
    # batch_size x seq_length x num_heads x head_dim, so the fused projection
    # is split and reshaped here (and transposed back right before the call).
query_states = query_states.view(bsz, q_len, self.num_heads,
self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads,
self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads,
self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
'The cache structure has changed since version v4.36. '
f'If you are using {self.__class__.__name__} '
'for auto-regressive decoding with k/v caching, '
'please make sure to initialize the attention class '
'with a layer index.')
kv_seq_len += past_key_value.get_usable_length(kv_seq_len,
self.layer_idx)
rotary_seq_len = max(kv_seq_len, position_ids.max().item() + 1)
cos, sin = self.rotary_emb(
value_states, position_ids, seq_len=rotary_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
cos, sin, position_ids)
use_sliding_windows = (
_flash_supports_window_size
and getattr(self.config, 'sliding_window', None) is not None
and kv_seq_len > self.config.sliding_window)
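    # Sliding-window attention only kicks in once the total key/value length
    # exceeds `config.sliding_window`; shorter sequences use full attention.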
if past_key_value is not None:
        # Activate the cache slicing only if the config has a
        # `sliding_window` attribute
cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
if (getattr(self.config, 'sliding_window', None) is not None
and kv_seq_len > self.config.sliding_window
and cache_has_contents):
slicing_tokens = 1 - self.config.sliding_window
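            # Keep only the last (sliding_window - 1) cached positions so
            # that, together with the incoming token, the cache never exceeds
            # the window.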
past_key = past_key_value[self.layer_idx][0]
past_value = past_key_value[self.layer_idx][1]
past_key = past_key[:, :, slicing_tokens:, :].contiguous()
past_value = past_value[:, :, slicing_tokens:, :].contiguous()
if past_key.shape[-2] != self.config.sliding_window - 1:
raise ValueError(
'past key must have a shape of (`batch_size, num_heads, '
'self.config.sliding_window-1, head_dim`), got'
f' {past_key.shape}')
if attention_mask is not None:
attention_mask = attention_mask[:, slicing_tokens:]
attention_mask = torch.cat(
[attention_mask,
torch.ones_like(attention_mask[:, -1:])],
dim=-1)
cache_kwargs = {'sin': sin, 'cos': cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(
key_states, value_states, self.layer_idx, cache_kwargs)
    # Repeat K/V heads if n_kv_heads < n_heads: in grouped-query attention
    # each K/V head is shared by `num_key_value_groups` query heads.
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_dropout = self.attention_dropout if self.training else 0.0
    # In PEFT we usually cast the layer norms to float32 for training
    # stability, so the input hidden states may get silently upcast to
    # float32. Cast them back to the expected compute dtype to be sure
    # everything works as intended.
    # This might slow down training & inference, so it is recommended not to
    # cast the LayerNorms to fp32.
if query_states.dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, '_pre_quantization_dtype'):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.qkv_proj.weight.dtype
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
    # Reshape to the expected shape for flash attention
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
enable_sequence_parallel = (
dist.is_initialized() and get_sequence_parallel_world_size() > 1
and self.training)
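    # With sequence parallelism, each rank holds a slice of the sequence; the
    # pre/post processing below is expected to perform all-to-all exchanges
    # that swap the split dimension between sequence and heads around the
    # attention call.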
if enable_sequence_parallel:
# (b, s // sp_world_size, nd, dim) -> (b, s, nd // sp_world_size, dim)
query_states, key_states, value_states = \
pre_process_for_sequence_parallel_attn(
query_states, key_states, value_states,
scatter_dim=2, gather_dim=1)
attn_output = self._flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
query_states.shape[1],
dropout=attn_dropout,
use_sliding_windows=use_sliding_windows,
)
if enable_sequence_parallel:
# (b, s, nd // sp_world_size, dim) -> (b, s // sp_world_size, nd, dim)
attn_output = post_process_for_sequence_parallel_attn(
attn_output, scatter_dim=1, gather_dim=2)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
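
# `phi3_attn_forward` is meant to replace the `forward` of Phi-3's flash
# attention module (xtuner normally does this via its dispatch mechanism).
# A minimal sketch of manual patching, assuming a loaded Phi-3 model; the
# attribute path below is illustrative, not a guaranteed API:
#
#   import types
#   for layer in model.model.layers:
#       layer.self_attn.forward = types.MethodType(phi3_attn_forward,
#                                                  layer.self_attn)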
def phi3_varlen_attn_forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor],
Optional[Tuple[torch.Tensor]]]:
if not _flash_supports_window_size:
raise ValueError(
'The current flash attention version does not support '
'sliding window attention.')
output_attentions = False
is_training = self.training
message_hub = MessageHub.get_instance('varlen_attn_args')
rank = dist.get_rank()
cumulative_len = message_hub.get_info(f'cumulative_len_rank_{rank}')
max_seqlen = message_hub.get_info(f'max_seqlen_rank_{rank}')
assert is_training == (past_key_value is None)
use_varlen_atten = (cumulative_len is not None)
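    # `cumulative_len` is assumed to follow the flash-attn `cu_seqlens`
    # convention: for a packed batch of sequences with lengths [5, 4, 3] it
    # would be tensor([0, 5, 9, 12]) and `max_seqlen` would be 5.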
if 'padding_mask' in kwargs:
        warnings.warn(
            'Passing `padding_mask` is deprecated and will be removed in '
            'v4.37. Please make sure to use `attention_mask` instead.')
# overwrite attention_mask with padding_mask
attention_mask = kwargs.pop('padding_mask')
bsz, q_len, _ = hidden_states.size()
assert bsz == 1, (f'If utilizing local attention, the batch size should be'
f' set to 1, but got {bsz}')
    # attention_mask is set to None if there is no padding token in input_ids;
    # varlen attention relies on data packing, so input_ids contains no
    # padding tokens.
assert attention_mask is None
qkv = self.qkv_proj(hidden_states)
query_pos = self.num_heads * self.head_dim
query_states = qkv[..., :query_pos]
key_states = qkv[..., query_pos:query_pos +
self.num_key_value_heads * self.head_dim]
value_states = qkv[...,
query_pos + self.num_key_value_heads * self.head_dim:]
    # Flash attention requires the input to have the shape
    # batch_size x seq_length x num_heads x head_dim, so the fused projection
    # is split and reshaped here (and transposed back right before the call).
query_states = query_states.view(bsz, q_len, self.num_heads,
self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads,
self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads,
self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
'The cache structure has changed since version v4.36. '
f'If you are using {self.__class__.__name__} '
'for auto-regressive decoding with k/v caching, '
'please make sure to initialize the attention class '
'with a layer index.')
kv_seq_len += past_key_value.get_usable_length(kv_seq_len,
self.layer_idx)
assert position_ids is not None
rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
cos, sin = self.rotary_emb(
value_states, position_ids, seq_len=rotary_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states,
cos, sin, position_ids)
use_sliding_windows = (
_flash_supports_window_size
and getattr(self.config, 'sliding_window', None) is not None
and kv_seq_len > self.config.sliding_window)
if past_key_value is not None:
        # Activate the cache slicing only if the config has a
        # `sliding_window` attribute
cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
if (getattr(self.config, 'sliding_window', None) is not None
and kv_seq_len > self.config.sliding_window
and cache_has_contents):
slicing_tokens = 1 - self.config.sliding_window
past_key = past_key_value[self.layer_idx][0]
past_value = past_key_value[self.layer_idx][1]
past_key = past_key[:, :, slicing_tokens:, :].contiguous()
past_value = past_value[:, :, slicing_tokens:, :].contiguous()
if past_key.shape[-2] != self.config.sliding_window - 1:
raise ValueError(
'past key must have a shape of (`batch_size, num_heads, '
'self.config.sliding_window-1, head_dim`), got'
f' {past_key.shape}')
if attention_mask is not None:
attention_mask = attention_mask[:, slicing_tokens:]
attention_mask = torch.cat(
[attention_mask,
torch.ones_like(attention_mask[:, -1:])],
dim=-1)
cache_kwargs = {'sin': sin, 'cos': cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(
key_states, value_states, self.layer_idx, cache_kwargs)
# repeat k/v heads if n_kv_heads < n_heads
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
    # In PEFT we usually cast the layer norms to float32 for training
    # stability, so the input hidden states may get silently upcast to
    # float32. Cast them back to the expected compute dtype to be sure
    # everything works as intended.
if query_states.dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, '_pre_quantization_dtype'):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.qkv_proj.weight.dtype
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
    # Reshape to the expected shape for flash attention
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
# ----------------- flash attention forward ------------------------#
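    # Older flash-attn releases use a top-left-aligned causal mask, which
    # mishandles the single-token decoding case, hence the q_len != 1 special
    # case below.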
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
causal = self.is_causal and q_len != 1
use_sliding_windows = (
_flash_supports_window_size
and getattr(self.config, 'sliding_window', None) is not None
and kv_seq_len > self.config.sliding_window)
    window_size = ((self.config.sliding_window, self.config.sliding_window)
                   if use_sliding_windows else (-1, -1))
attn_dropout = self.attention_dropout if self.training else 0.0
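    # flash-attn treats window_size=(-1, -1) as "no sliding window". Two paths
    # follow: `varlen_flash_attn` consumes the packed layout described by
    # `cumulative_len` / `max_seqlen`, while `flash_attn_wo_mask` runs plain
    # dense flash attention when no packing information is available.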
if use_varlen_atten:
attn_output = varlen_flash_attn(
query_states,
key_states,
value_states,
cumulative_len,
max_seqlen,
causal=causal,
dropout_p=attn_dropout,
window_size=window_size,
training=self.training)
else:
attn_output = flash_attn_wo_mask(
query_states,
key_states,
value_states,
causal=causal,
dropout_p=attn_dropout,
window_size=window_size,
training=self.training)
# ---------------- flash attention forward end ------------------- #
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
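
# A hedged sketch of how the varlen path gets its packing info (the keys match
# the MessageHub lookups above; the surrounding training loop is assumed and
# not part of this file):
#
#   message_hub = MessageHub.get_instance('varlen_attn_args')
#   rank = dist.get_rank()
#   message_hub.update_info(f'cumulative_len_rank_{rank}', cumulative_len)
#   message_hub.update_info(f'max_seqlen_rank_{rank}', max_seqlen)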