from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional

import torch
import torch.nn.functional as F
from torch.nn.parameter import Parameter

from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.communication_op import (
    tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather)
from vllm.model_executor.parallel_utils.utils import (
    divide, split_tensor_along_last_dim)
from vllm.model_executor.utils import set_weight_attrs
from vllm.logger import init_logger

logger = init_logger(__name__)


class LinearMethodBase(ABC):
    """Base class for different (maybe quantized) linear methods."""

    @abstractmethod
    def create_weights(self, input_size_per_partition: int,
                       output_size_per_partition: int, input_size: int,
                       output_size: int,
                       params_dtype: torch.dtype) -> Dict[str, Any]:
        """Create weights for a linear layer."""
        raise NotImplementedError

    @abstractmethod
    def apply_weights(self,
                      weights: Dict[str, torch.Tensor],
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Apply the weights to the input tensor."""
        raise NotImplementedError
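

# Illustrative sketch (not part of the original vLLM source): a quantized linear
# method would subclass LinearMethodBase, allocate its packed tensors in
# create_weights(), and dequantize (or call a fused kernel) in apply_weights().
# The class name `Int8LinearMethod` and the per-channel `scale` parameter below
# are hypothetical, shown only to illustrate the contract.
#
#     class Int8LinearMethod(LinearMethodBase):
#
#         def create_weights(self, input_size_per_partition, output_size_per_partition,
#                            input_size, output_size, params_dtype):
#             weight = Parameter(torch.empty(output_size_per_partition,
#                                            input_size_per_partition,
#                                            device=torch.cuda.current_device(),
#                                            dtype=torch.int8),
#                                requires_grad=False)
#             scale = Parameter(torch.ones(output_size_per_partition,
#                                          device=torch.cuda.current_device(),
#                                          dtype=params_dtype),
#                               requires_grad=False)
#             set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
#             set_weight_attrs(scale, {"output_dim": 0})
#             return {"weight": weight, "scale": scale}
#
#         def apply_weights(self, weights, x, bias=None):
#             # Dequantize to the activation dtype, then fall back to F.linear.
#             w = weights["weight"].to(x.dtype) * weights["scale"].unsqueeze(-1)
#             return F.linear(x, w, bias)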


class UnquantizedLinearMethod(LinearMethodBase):
    """Linear method without quantization.

    Args:
        separate_bias_add: If true, add bias separately after matrix
                           multiplication.
    """

    def __init__(self, separate_bias_add: bool = False):
        self.separate_bias_add = separate_bias_add

    def create_weights(self, input_size_per_partition: int,
                       output_size_per_partition: int, input_size: int,
                       output_size: int,
                       params_dtype: torch.dtype) -> Dict[str, Any]:
        weight = Parameter(torch.empty(output_size_per_partition,
                                       input_size_per_partition,
                                       device=torch.cuda.current_device(),
                                       dtype=params_dtype),
                           requires_grad=False)
        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
        return {"weight": weight}

    def apply_weights(self,
                      weights: Dict[str, torch.Tensor],
                      x: torch.Tensor,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        weight = weights["weight"]
        if self.separate_bias_add:
            # `bias` is a tensor, so check for None explicitly instead of relying
            # on truthiness (which raises for multi-element tensors).
            if bias is not None:
                return F.linear(x, weight) + bias
            return F.linear(x, weight)
        return F.linear(x, weight, bias)
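

# Illustrative sketch (not part of the original source): the create_weights /
# apply_weights contract used on its own. Assumes a CUDA device is available,
# since create_weights() above allocates on torch.cuda.current_device().
#
#     method = UnquantizedLinearMethod()
#     weights = method.create_weights(input_size_per_partition=16,
#                                     output_size_per_partition=32,
#                                     input_size=16, output_size=32,
#                                     params_dtype=torch.float16)
#     x = torch.randn(4, 16, dtype=torch.float16, device="cuda")
#     y = method.apply_weights(weights, x)  # shape: (4, 32)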


class ReplicatedLinear(torch.nn.Module):
    """Replicated linear layer.

    Args:
        input_size: input dimension of the linear layer.
        output_size: output dimension of the linear layer.
        bias: If true, add bias.
        skip_bias_add: If true, skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        linear_method: (Maybe quantized) linear method.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        bias: bool = True,
        skip_bias_add: bool = False,
        params_dtype: Optional[torch.dtype] = None,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.skip_bias_add = skip_bias_add
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.params_dtype = params_dtype
        if linear_method is None:
            linear_method = UnquantizedLinearMethod()
        self.linear_method = linear_method
        self.linear_weights = self.linear_method.create_weights(
            self.input_size, self.output_size, self.input_size,
            self.output_size, self.params_dtype)
        for name, weight in self.linear_weights.items():
            if isinstance(weight, torch.Tensor):
                self.register_parameter(name, weight)
        if bias:
            self.bias = Parameter(
                torch.empty(self.output_size,
                            device=torch.cuda.current_device(),
                            dtype=self.params_dtype))
            set_weight_attrs(self.bias, {"output_dim": 0})
        else:
            self.register_parameter("bias", None)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        bias = self.bias if not self.skip_bias_add else None
        output = self.linear_method.apply_weights(self.linear_weights, x, bias)
        output_bias = self.bias if self.skip_bias_add else None
        return output, output_bias
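

# Illustrative sketch (not part of the original source): ReplicatedLinear keeps the
# full, unsharded weight on every rank, so it behaves like nn.Linear except that
# forward() returns an (output, output_bias) tuple. Assumes a CUDA device; shapes
# below are made up.
#
#     proj = ReplicatedLinear(1024, 4096, bias=True)
#     x = torch.randn(2, 1024, dtype=proj.params_dtype, device="cuda")
#     out, out_bias = proj(x)  # out: (2, 4096); out_bias is None since
#                              # skip_bias_add defaults to False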


class ColumnParallelLinear(torch.nn.Module):
    """Linear layer with column parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its second dimension as A = [A_1, ..., A_p].

    Args:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias.
        gather_output: If true, call all-gather on output and make Y available
                       to all GPUs; otherwise, every GPU will have its own
                       output, which is Y_i = XA_i.
        skip_bias_add: This was added to enable performance optimizations where
                       bias can be fused with other element-wise operations. We
                       skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        linear_method: (Maybe quantized) linear method.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        bias: bool = True,
        gather_output: bool = False,
        skip_bias_add: bool = False,
        params_dtype: Optional[torch.dtype] = None,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.gather_output = gather_output
        # Divide the weight matrix along the last dimension.
        tp_size = get_tensor_model_parallel_world_size()
        self.output_size_per_partition = divide(output_size, tp_size)
        self.skip_bias_add = skip_bias_add
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.params_dtype = params_dtype
        if linear_method is None:
            linear_method = UnquantizedLinearMethod()
        self.linear_method = linear_method
        self.linear_weights = self.linear_method.create_weights(
            self.input_size, self.output_size_per_partition, self.input_size,
            self.output_size, self.params_dtype)
        for name, weight in self.linear_weights.items():
            if isinstance(weight, torch.Tensor):
                self.register_parameter(name, weight)
                set_weight_attrs(weight, {"weight_loader": self.weight_loader})
        if bias:
            self.bias = Parameter(
                torch.empty(self.output_size_per_partition,
                            device=torch.cuda.current_device(),
                            dtype=params_dtype))
            set_weight_attrs(self.bias, {
                "output_dim": 0,
                "weight_loader": self.weight_loader,
            })
        else:
            self.register_parameter("bias", None)

    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
        tp_rank = get_tensor_model_parallel_rank()
        output_dim = getattr(param, "output_dim", None)
        param_data = param.data
        if output_dim is not None:
            shard_size = param_data.shape[output_dim]
            start_idx = tp_rank * shard_size
            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                 shard_size)
        assert param_data.shape == loaded_weight.shape
        param_data.copy_(loaded_weight)

    def forward(self, input_):
        bias = self.bias if not self.skip_bias_add else None
        # Matrix multiply.
        output_parallel = self.linear_method.apply_weights(
            self.linear_weights, input_, bias)
        if self.gather_output:
            # All-gather across the partitions.
            output = tensor_model_parallel_all_gather(output_parallel)
        else:
            output = output_parallel
        output_bias = self.bias if self.skip_bias_add else None
        return output, output_bias
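

# Illustrative sketch (not part of the original source): with tensor-parallel size
# tp_size, each rank holds a (output_size // tp_size, input_size) weight shard, and
# weight_loader() narrows the full checkpoint tensor to this rank's slice along
# output_dim. The shapes below are made up.
#
#     layer = ColumnParallelLinear(input_size=4096, output_size=11008, bias=False)
#     full_weight = torch.randn(11008, 4096)          # unsharded checkpoint tensor
#     layer.weight_loader(layer.weight, full_weight)  # copies this rank's rows
#     x = torch.randn(2, 4096, dtype=layer.params_dtype, device="cuda")
#     out, _ = layer(x)  # (2, 11008 // tp_size), or (2, 11008) if gather_output=True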


class MergedColumnParallelLinear(ColumnParallelLinear):
    """Packed linear layers with column parallelism.

    Similar to ColumnParallelLinear, but the weight matrix is concatenated
    along the output dimension. When the weight matrix is loaded, the
    different partitions are sharded separately.

    Args:
        input_size: input dimension of the linear layer.
        output_sizes: list of output dimensions of the linear layer.
        bias: If true, add bias.
        gather_output: If true, call all-gather on output and make the output
                       available to all GPUs; otherwise, every GPU will have
                       its own output.
        skip_bias_add: This was added to enable performance optimizations where
                       bias can be fused with other element-wise operations. We
                       skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        linear_method: (Maybe quantized) linear method.
    """

    def __init__(
        self,
        input_size: int,
        output_sizes: List[int],
        bias: bool = True,
        gather_output: bool = False,
        skip_bias_add: bool = False,
        params_dtype: Optional[torch.dtype] = None,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        self.output_sizes = output_sizes
        tp_size = get_tensor_model_parallel_world_size()
        assert all(output_size % tp_size == 0 for output_size in output_sizes)
        super().__init__(input_size, sum(output_sizes), bias, gather_output,
                         skip_bias_add, params_dtype, linear_method)

    def weight_loader(self,
                      param: Parameter,
                      loaded_weight: torch.Tensor,
                      loaded_shard_id: Optional[int] = None):
        param_data = param.data
        output_dim = getattr(param, "output_dim", None)
        if loaded_shard_id is None:
            # Loaded weight is already packed.
            if output_dim is None:
                assert param_data.shape == loaded_weight.shape
                param_data.copy_(loaded_weight)
                return
            current_shard_offset = 0
            shard_offsets = []
            for i, output_size in enumerate(self.output_sizes):
                shard_offsets.append((i, current_shard_offset, output_size))
                current_shard_offset += output_size
            packed_dim = getattr(param, "packed_dim", None)
            for shard_id, shard_offset, shard_size in shard_offsets:
                # If quantized, we need to adjust the offset and size to account
                # for the packing.
                if packed_dim == output_dim:
                    shard_size = shard_size // param.pack_factor
                    shard_offset = shard_offset // param.pack_factor
                loaded_weight_shard = loaded_weight.narrow(
                    output_dim, shard_offset, shard_size)
                self.weight_loader(param, loaded_weight_shard, shard_id)
            return

        assert loaded_shard_id < len(self.output_sizes)
        tp_rank = get_tensor_model_parallel_rank()
        tp_size = get_tensor_model_parallel_world_size()
        if output_dim is not None:
            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
            shard_size = self.output_sizes[loaded_shard_id] // tp_size
            # If quantized, we need to adjust the offset and size to account
            # for the packing.
            packed_dim = getattr(param, "packed_dim", None)
            if packed_dim == output_dim:
                shard_size = shard_size // param.pack_factor
                shard_offset = shard_offset // param.pack_factor
            param_data = param_data.narrow(output_dim, shard_offset,
                                           shard_size)
            start_idx = tp_rank * shard_size
            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                 shard_size)
        else:
            ignore_warning = getattr(param, "ignore_warning", False)
            if not ignore_warning:
                logger.warning(
                    "Loading a weight without `output_dim` attribute in "
                    "MergedColumnParallelLinear, assume the weight is "
                    "the same for all partitions.")
        assert param_data.shape == loaded_weight.shape
        param_data.copy_(loaded_weight)
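

# Illustrative sketch (not part of the original source): the typical use is a merged
# gate_proj/up_proj in a SwiGLU MLP. Each checkpoint tensor is copied into its own
# slice of the packed parameter via loaded_shard_id (0 for the first output size,
# 1 for the second). The shapes below are made up.
#
#     gate_up = MergedColumnParallelLinear(4096, [11008, 11008], bias=False)
#     gate_w = torch.randn(11008, 4096)  # unsharded gate_proj.weight from a checkpoint
#     up_w = torch.randn(11008, 4096)    # unsharded up_proj.weight
#     gate_up.weight_loader(gate_up.weight, gate_w, loaded_shard_id=0)
#     gate_up.weight_loader(gate_up.weight, up_w, loaded_shard_id=1)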


class QKVParallelLinear(ColumnParallelLinear):
    """Linear layers for the attention's QKV transformation.

    Linear layers for the linear transformation of the query, key, and value
    vectors in the attention layer. The weight matrix is concatenated along
    the output dimension. The layer is parallelized along the head dimension.
    When the number of key/value heads is smaller than the number of query
    heads (e.g., multi-query/grouped-query attention), the key/value heads may
    be replicated while the query heads are partitioned.

    Args:
        hidden_size: input hidden state size of the transformer.
        head_size: size of each attention head.
        total_num_heads: total number of attention query heads.
        total_num_kv_heads: total number of attention key/value heads. If
                            None, assume total_num_kv_heads = total_num_heads.
        bias: If true, add bias.
        skip_bias_add: This was added to enable performance optimizations where
                       bias can be fused with other element-wise operations. We
                       skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        linear_method: (Maybe quantized) linear method.
    """

    def __init__(
        self,
        hidden_size: int,
        head_size: int,
        total_num_heads: int,
        total_num_kv_heads: Optional[int] = None,
        bias: bool = True,
        skip_bias_add: bool = False,
        params_dtype: Optional[torch.dtype] = None,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        self.hidden_size = hidden_size
        self.head_size = head_size
        self.total_num_heads = total_num_heads
        if total_num_kv_heads is None:
            total_num_kv_heads = total_num_heads
        self.total_num_kv_heads = total_num_kv_heads
        # Divide the weight matrix along the last dimension.
        tp_size = get_tensor_model_parallel_world_size()
        self.num_heads = divide(self.total_num_heads, tp_size)
        if tp_size >= self.total_num_kv_heads:
            self.num_kv_heads = 1
            self.num_kv_head_replicas = divide(tp_size,
                                               self.total_num_kv_heads)
        else:
            self.num_kv_heads = divide(self.total_num_kv_heads, tp_size)
            self.num_kv_head_replicas = 1
        input_size = self.hidden_size
        output_size = (self.num_heads +
                       2 * self.num_kv_heads) * tp_size * self.head_size
        super().__init__(input_size, output_size, bias, False, skip_bias_add,
                         params_dtype, linear_method)

    def weight_loader(self,
                      param: Parameter,
                      loaded_weight: torch.Tensor,
                      loaded_shard_id: Optional[str] = None):
        param_data = param.data
        output_dim = getattr(param, "output_dim", None)
        if loaded_shard_id is None:
            # Loaded weight is already packed.
            if output_dim is None:
                assert param_data.shape == loaded_weight.shape
                param_data.copy_(loaded_weight)
                return
            shard_offsets = [
                # (shard_id, shard_offset, shard_size)
                ("q", 0, self.total_num_heads * self.head_size),
                ("k", self.total_num_heads * self.head_size,
                 self.total_num_kv_heads * self.head_size),
                ("v", (self.total_num_heads + self.total_num_kv_heads) *
                 self.head_size, self.total_num_kv_heads * self.head_size),
            ]
            packed_dim = getattr(param, "packed_dim", None)
            for shard_id, shard_offset, shard_size in shard_offsets:
                # If quantized, we need to adjust the offset and size to account
                # for the packing.
                if packed_dim == output_dim:
                    shard_size = shard_size // param.pack_factor
                    shard_offset = shard_offset // param.pack_factor
                loaded_weight_shard = loaded_weight.narrow(
                    output_dim, shard_offset, shard_size)
                self.weight_loader(param, loaded_weight_shard, shard_id)
            return

        tp_rank = get_tensor_model_parallel_rank()
        assert loaded_shard_id in ["q", "k", "v"]
        if output_dim is not None:
            if loaded_shard_id == "q":
                shard_offset = 0
                shard_size = self.num_heads * self.head_size
            elif loaded_shard_id == "k":
                shard_offset = self.num_heads * self.head_size
                shard_size = self.num_kv_heads * self.head_size
            elif loaded_shard_id == "v":
                shard_offset = (self.num_heads +
                                self.num_kv_heads) * self.head_size
                shard_size = self.num_kv_heads * self.head_size
            # If quantized, we need to adjust the offset and size to account
            # for the packing.
            packed_dim = getattr(param, "packed_dim", None)
            if packed_dim == output_dim:
                shard_size = shard_size // param.pack_factor
                shard_offset = shard_offset // param.pack_factor
            param_data = param_data.narrow(output_dim, shard_offset,
                                           shard_size)
            if loaded_shard_id == "q":
                shard_id = tp_rank
            else:
                shard_id = tp_rank // self.num_kv_head_replicas
            start_idx = shard_id * shard_size
            loaded_weight = loaded_weight.narrow(output_dim, start_idx,
                                                 shard_size)
        else:
            ignore_warning = getattr(param, "ignore_warning", False)
            if not ignore_warning:
                logger.warning(
                    "Loading a weight without `output_dim` attribute in "
                    "QKVParallelLinear, assume the weight is the same "
                    "for all partitions.")
        assert param_data.shape == loaded_weight.shape
        param_data.copy_(loaded_weight)
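

# Illustrative sketch (not part of the original source): for a grouped-query model
# with 32 query heads and 8 KV heads of size 128, each rank holds a packed QKV
# projection of (num_heads + 2 * num_kv_heads) * head_size rows. Checkpoint q/k/v
# tensors are loaded with loaded_shard_id "q", "k", or "v". Shapes are made up.
#
#     qkv = QKVParallelLinear(hidden_size=4096, head_size=128,
#                             total_num_heads=32, total_num_kv_heads=8, bias=False)
#     q_w = torch.randn(32 * 128, 4096)  # unsharded q_proj.weight
#     k_w = torch.randn(8 * 128, 4096)   # unsharded k_proj.weight
#     v_w = torch.randn(8 * 128, 4096)   # unsharded v_proj.weight
#     qkv.weight_loader(qkv.weight, q_w, "q")
#     qkv.weight_loader(qkv.weight, k_w, "k")
#     qkv.weight_loader(qkv.weight, v_w, "v")
#     x = torch.randn(2, 4096, dtype=qkv.params_dtype, device="cuda")
#     qkv_out, _ = qkv(x)  # (2, (num_heads + 2 * num_kv_heads) * 128) per rank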


class RowParallelLinear(torch.nn.Module):
    """Linear layer with row parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its first dimension and X along its second dimension as:
               -   -
              | A_1 |
              | .   |
          A = | .   |        X = [X_1, ..., X_p]
              | .   |
              | A_p |
               -   -

    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias. Note that bias is not parallelized.
        input_is_parallel: If true, we assume that the input is already
                           split across the GPUs and we do not split
                           again.
        skip_bias_add: This was added to enable performance optimization where
                       bias can be fused with other element-wise operations.
                       We skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        linear_method: (Maybe quantized) linear method.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        bias: bool = True,
        input_is_parallel: bool = True,
        skip_bias_add: bool = False,
        params_dtype: Optional[torch.dtype] = None,
        reduce_results: bool = True,
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.input_is_parallel = input_is_parallel
        self.reduce_results = reduce_results
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.params_dtype = params_dtype
        # Divide the weight matrix along the last dimension.
        self.tp_size = get_tensor_model_parallel_world_size()
        self.input_size_per_partition = divide(input_size, self.tp_size)
        self.skip_bias_add = skip_bias_add
        if linear_method is None:
            linear_method = UnquantizedLinearMethod()
        self.linear_method = linear_method
        self.linear_weights = self.linear_method.create_weights(
            self.input_size_per_partition, self.output_size, self.input_size,
            self.output_size, self.params_dtype)
        for name, weight in self.linear_weights.items():
            if isinstance(weight, torch.Tensor):
                self.register_parameter(name, weight)
                set_weight_attrs(weight, {"weight_loader": self.weight_loader})

        if not reduce_results and (bias and not skip_bias_add):
            raise ValueError("When reduce_results=False, adding bias to the "
                             "partial outputs can lead to incorrect results.")
        if bias:
            self.bias = Parameter(
                torch.empty(self.output_size,
                            device=torch.cuda.current_device(),
                            dtype=params_dtype))
            set_weight_attrs(self.bias, {
                "output_dim": 0,
                "weight_loader": self.weight_loader,
            })
        else:
            self.register_parameter("bias", None)

    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
        tp_rank = get_tensor_model_parallel_rank()
        input_dim = getattr(param, "input_dim", None)
        param_data = param.data
        if input_dim is not None:
            shard_size = param_data.shape[input_dim]
            start_idx = tp_rank * shard_size
            loaded_weight = loaded_weight.narrow(input_dim, start_idx,
                                                 shard_size)
        assert param_data.shape == loaded_weight.shape
        param_data.copy_(loaded_weight)

    def forward(self, input_):
        # Set up backprop all-reduce.
        if self.input_is_parallel:
            input_parallel = input_
        else:
            tp_rank = get_tensor_model_parallel_rank()
            splitted_input = split_tensor_along_last_dim(
                input_, num_partitions=self.tp_size)
            input_parallel = splitted_input[tp_rank].contiguous()

        # Matrix multiply.
        output_parallel = self.linear_method.apply_weights(
            self.linear_weights, input_parallel)
        if self.reduce_results and self.tp_size > 1:
            output_ = tensor_model_parallel_all_reduce(output_parallel)
        else:
            output_ = output_parallel

        if not self.skip_bias_add:
            output = output_ + self.bias if self.bias is not None else output_
            output_bias = None
        else:
            output = output_
            output_bias = self.bias
        return output, output_bias
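

# Illustrative sketch (not part of the original source): RowParallelLinear is usually
# paired with a ColumnParallelLinear so that the intermediate activations stay
# sharded and only one all-reduce is needed per block (the classic Megatron-style
# MLP pattern). Assumes model-parallel groups are initialized; shapes are made up.
#
#     up = ColumnParallelLinear(4096, 11008, bias=False)   # output stays sharded
#     down = RowParallelLinear(11008, 4096, bias=False,
#                              input_is_parallel=True)     # all-reduces its output
#     x = torch.randn(2, 4096, dtype=up.params_dtype, device="cuda")
#     h, _ = up(x)            # (2, 11008 // tp_size)
#     y, _ = down(F.silu(h))  # (2, 4096), identical on every rank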