"""Utilities for selecting and loading models.""" import contextlib import torch import torch.nn as nn from vllm.config import ModelConfig from vllm.model_executor.models import ModelRegistry from vllm.model_executor.weight_utils import get_quant_config, initialize_dummy_weights from .llama import LlamaModel @contextlib.contextmanager def _set_default_torch_dtype(dtype: torch.dtype): """Sets the default torch dtype to the given dtype.""" old_dtype = torch.get_default_dtype() torch.set_default_dtype(dtype) yield torch.set_default_dtype(old_dtype) def get_model(model_config: ModelConfig) -> nn.Module: # Get the (maybe quantized) linear method. linear_method = None if model_config.quantization is not None: quant_config = get_quant_config( model_config.quantization, model_config.model, model_config.hf_config, model_config.download_dir, ) capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] if capability < quant_config.get_min_capability(): raise ValueError( f"The quantization method {model_config.quantization} is not " "supported for the current GPU. " f"Minimum capability: {quant_config.get_min_capability()}. " f"Current capability: {capability}." ) supported_dtypes = quant_config.get_supported_act_dtypes() if model_config.dtype not in supported_dtypes: raise ValueError( f"{model_config.dtype} is not supported for quantization " f"method {model_config.quantization}. Supported dtypes: " f"{supported_dtypes}" ) linear_method = quant_config.get_linear_method() with _set_default_torch_dtype(model_config.dtype): # Create a model instance. # The weights will be initialized as empty tensors. with torch.device("cuda"): model = LlamaModel(model_config.hf_config, linear_method) if model_config.load_format == "dummy": # NOTE(woosuk): For accurate performance evaluation, we assign # random values to the weights. initialize_dummy_weights(model) else: # Load the weights from the cached or downloaded files. model.load_weights( model_config.model, model_config.download_dir, model_config.load_format, model_config.revision, ) return model.eval()