import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional, Tuple

from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig, LoRAConfig)


@dataclass
class EngineArgs:
    """Arguments for vLLM engine."""
    model: str
    tokenizer: Optional[str] = None
    tokenizer_mode: str = 'auto'
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = 'auto'
    dtype: str = 'auto'
    kv_cache_dtype: str = 'auto'
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_paddings: int = 256
    disable_log_stats: bool = False
    revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    enforce_eager: bool = False
    max_context_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    lora_extra_vocab_size: int = 256
    lora_dtype: str = 'auto'
    max_cpu_loras: Optional[int] = None

    def __post_init__(self):
        if self.tokenizer is None:
            self.tokenizer = self.model

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for vLLM engine."""
        # NOTE: If you update any of the arguments below, please also
        # make sure to update docs/source/models/engine_args.rst

        # Model arguments
        parser.add_argument(
            '--model',
            type=str,
            default='facebook/opt-125m',
            help='name or path of the huggingface model to use')
        parser.add_argument(
            '--tokenizer',
            type=str,
            default=EngineArgs.tokenizer,
            help='name or path of the huggingface tokenizer to use')
        parser.add_argument(
            '--revision',
            type=str,
            default=None,
            help='the specific model version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument(
            '--tokenizer-revision',
            type=str,
            default=None,
            help='the specific tokenizer version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default=EngineArgs.tokenizer_mode,
                            choices=['auto', 'slow'],
                            help='tokenizer mode. "auto" will use the fast '
                            'tokenizer if available, and "slow" will '
                            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code',
                            action='store_true',
                            help='trust remote code from huggingface')
        parser.add_argument('--download-dir',
                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the weights, '
                            'defaults to the default cache dir of '
                            'huggingface')
        parser.add_argument(
            '--load-format',
            type=str,
            default=EngineArgs.load_format,
            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
            help='The format of the model weights to load. '
            '"auto" will try to load the weights in the safetensors format '
            'and fall back to the pytorch bin format if safetensors format '
            'is not available. '
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            'a numpy cache to speed up the loading. '
            '"dummy" will initialize the weights with random values, '
            'which is mainly for profiling.')
        parser.add_argument(
            '--dtype',
            type=str,
            default=EngineArgs.dtype,
            choices=[
                'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
            ],
            help='data type for model weights and activations. '
            'The "auto" option will use FP16 precision '
            'for FP32 and FP16 models, and BF16 precision '
            'for BF16 models.')
        parser.add_argument(
            '--kv-cache-dtype',
            type=str,
            choices=['auto', 'fp8_e5m2'],
            default='auto',
            help='Data type for kv cache storage. If "auto", will use model '
            'data type. Note FP8 is not supported when cuda version is '
            'lower than 11.8.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=None,
                            help='model context length. If unspecified, '
                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        parser.add_argument(
            '--max-parallel-loading-workers',
            type=int,
            help='load model sequentially in multiple batches, '
            'to avoid RAM OOM when using tensor '
            'parallel and large models')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
        parser.add_argument('--seed',
                            type=int,
                            default=EngineArgs.seed,
                            help='random seed')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument(
            '--gpu-memory-utilization',
            type=float,
            default=EngineArgs.gpu_memory_utilization,
            help='the fraction of GPU memory to be used for '
            'the model executor, which can range from 0 to 1. '
            'If unspecified, will use the default value of 0.9.')
        parser.add_argument('--max-num-batched-tokens',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                            'iteration')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--max-paddings',
                            type=int,
                            default=EngineArgs.max_paddings,
                            help='maximum number of paddings in a batch')
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='disable logging statistics')
        # Quantization settings.
        parser.add_argument('--quantization',
                            '-q',
                            type=str,
                            choices=['awq', 'gptq', 'squeezellm', None],
                            default=None,
                            help='Method used to quantize the weights. If '
                            'None, we first check the `quantization_config` '
                            'attribute in the model config file. If that is '
                            'None, we assume the model weights are not '
                            'quantized and use `dtype` to determine the data '
                            'type of the weights.')
        parser.add_argument('--enforce-eager',
                            action='store_true',
                            help='Always use eager-mode PyTorch. If False, '
                            'will use eager mode and CUDA graph in hybrid '
                            'for maximal performance and flexibility.')
        parser.add_argument('--max-context-len-to-capture',
                            type=int,
                            default=EngineArgs.max_context_len_to_capture,
                            help='maximum context length covered by CUDA '
                            'graphs. When a sequence has context length '
                            'larger than this, we fall back to eager mode.')
        parser.add_argument('--disable-custom-all-reduce',
                            action='store_true',
                            default=EngineArgs.disable_custom_all_reduce,
                            help='See ParallelConfig')
        # LoRA related configs
        parser.add_argument('--enable-lora',
                            action='store_true',
                            help='If True, enable handling of LoRA adapters.')
        parser.add_argument('--max-loras',
                            type=int,
                            default=EngineArgs.max_loras,
                            help='Max number of LoRAs in a single batch.')
        parser.add_argument('--max-lora-rank',
                            type=int,
                            default=EngineArgs.max_lora_rank,
                            help='Max LoRA rank.')
        parser.add_argument(
            '--lora-extra-vocab-size',
            type=int,
            default=EngineArgs.lora_extra_vocab_size,
            help=('Maximum size of extra vocabulary that can be '
                  'present in a LoRA adapter (added to the base '
                  'model vocabulary).'))
        parser.add_argument(
            '--lora-dtype',
            type=str,
            default=EngineArgs.lora_dtype,
            choices=['auto', 'float16', 'bfloat16', 'float32'],
            help=('Data type for LoRA. If auto, will default to '
                  'base model dtype.'))
        parser.add_argument(
            '--max-cpu-loras',
            type=int,
            default=EngineArgs.max_cpu_loras,
            help=('Maximum number of LoRAs to store in CPU memory. '
                  'Must be >= max_num_seqs. '
                  'Defaults to max_num_seqs.'))
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args
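
    # Usage sketch (illustrative, not part of the original module): how
    # `add_cli_args` and `from_cli_args` are typically combined. The flag
    # values below are assumptions chosen for the example.
    #
    #     parser = argparse.ArgumentParser(description='vLLM engine')
    #     parser = EngineArgs.add_cli_args(parser)
    #     args = parser.parse_args(['--model', 'facebook/opt-125m',
    #                               '--tensor-parallel-size', '2'])
    #     engine_args = EngineArgs.from_cli_args(args)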

    def create_engine_configs(
        self,
    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig,
               Optional[LoRAConfig]]:
        model_config = ModelConfig(self.model, self.tokenizer,
                                   self.tokenizer_mode, self.trust_remote_code,
                                   self.download_dir, self.load_format,
                                   self.dtype, self.seed, self.revision,
                                   self.tokenizer_revision, self.max_model_len,
                                   self.quantization, self.enforce_eager,
                                   self.max_context_len_to_capture)
        cache_config = CacheConfig(self.block_size,
                                   self.gpu_memory_utilization,
                                   self.swap_space, self.kv_cache_dtype,
                                   model_config.get_sliding_window())
        parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                         self.tensor_parallel_size,
                                         self.worker_use_ray,
                                         self.max_parallel_loading_workers,
                                         self.disable_custom_all_reduce)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                           self.max_num_seqs,
                                           model_config.max_model_len,
                                           self.max_paddings)
        lora_config = LoRAConfig(
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,
            lora_extra_vocab_size=self.lora_extra_vocab_size,
            lora_dtype=self.lora_dtype,
            max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
            and self.max_cpu_loras > 0 else None) if self.enable_lora else None
        return (model_config, cache_config, parallel_config, scheduler_config,
                lora_config)
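
    # Sketch of consuming the tuple returned above (order taken from the
    # return statement). Note that building the configs needs the model's
    # config to be resolvable, locally or from the Hugging Face Hub.
    #
    #     (model_config, cache_config, parallel_config, scheduler_config,
    #      lora_config) = engine_args.create_engine_configs()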


@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous vLLM engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: Optional[int] = None

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='use Ray to start the LLM engine in a '
                            'separate process as the server process.')
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='disable logging requests')
        parser.add_argument('--max-log-len',
                            type=int,
                            default=None,
                            help='max number of prompt characters or prompt '
                            'ID numbers being printed in log. '
                            'Default: unlimited.')
        return parser
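

# Usage sketch (illustrative, not part of the original module): building
# AsyncEngineArgs from the command line. The flag values passed to
# parse_args below are assumptions chosen for the example.
if __name__ == '__main__':
    demo_parser = argparse.ArgumentParser(description='AsyncEngineArgs demo')
    demo_parser = AsyncEngineArgs.add_cli_args(demo_parser)
    demo_args = demo_parser.parse_args(
        ['--model', 'facebook/opt-125m', '--disable-log-requests'])
    async_engine_args = AsyncEngineArgs.from_cli_args(demo_args)
    print(async_engine_args)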