diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 93a4de73faa89478c0968434313e03cbfe950032..0000000000000000000000000000000000000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,77 +0,0 @@ -# Contributing to vLLM - -Thank you for your interest in contributing to vLLM! -Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. -There are several ways you can contribute to the project: - -- Identify and report any issues or bugs. -- Request or add a new model. -- Suggest or implement new features. - -However, remember that contributions aren't just about code. -We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. - -Finally, one of the most impactful ways to support us is by raising awareness about vLLM. -Talk about it in your blog posts, highlighting how it's driving your incredible projects. -Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. - - -## Setup for development - -### Build from source - -```bash -pip install -r requirements.txt -pip install -e . # This may take several minutes. -``` - -### Testing - -```bash -pip install -r requirements-dev.txt - -# Static type checking -mypy -# Unit tests -pytest tests/ -``` -**Note:** Currently, the repository does not pass the mypy tests. - - -## Contributing Guidelines - -### Issue Reporting - -If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. -If not, please file a new issue, providing as much relevant information as possible. - -### Coding Style Guide - -In general, we adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). - -We include a formatting script [`format.sh`](./format.sh) to format the code. - -### Pull Requests - -When submitting a pull request: - -1. Make sure your code has been rebased on top of the latest commit on the main branch. -2. Ensure code is properly formatted by running [`format.sh`](./format.sh). -3. Include a detailed description of the changes in the pull request. -Explain why you made the changes you did. -If your pull request fixes an open issue, please include a reference to it in the description. - -### Code Reviews - -All submissions, including submissions by project members, require a code review. -To make the review process as smooth as possible, please: - -1. Keep your changes as concise as possible. -If your pull request involves multiple unrelated changes, consider splitting it into separate pull requests. -2. Respond to all comments within a reasonable time frame. -If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion. - -### Thank You - -Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. -Your contributions make vLLM a great tool for everyone! diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index ce1ad4d39561114e4c4a916db803a3f38ced31ea..0000000000000000000000000000000000000000 --- a/Dockerfile +++ /dev/null @@ -1,121 +0,0 @@ -# The vLLM Dockerfile is used to construct vLLM image that can be directly used -# to run the OpenAI compatible server. 
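Once this image is built and running, the CMD at the end of the Dockerfile starts the OpenAI-compatible server on port 7860 with the model registered under the name `default`. A minimal client sketch is shown below; it is illustrative only and assumes the container's port is published locally and that the `requests` package is available on the client side.

```python
# Illustrative client for the server launched by this image (not part of the image itself).
# Assumes the container is run with `-p 7860:7860` so the port is reachable locally.
import requests

resp = requests.post(
    "http://localhost:7860/v1/completions",  # vLLM's OpenAI-compatible completions route
    json={
        "model": "default",        # matches --served-model-name in the CMD below
        "prompt": "Hello, my name is",
        "max_tokens": 32,
        "temperature": 0.0,
    },
    timeout=60,
)
print(resp.json()["choices"][0]["text"])
```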
- -#################### BASE BUILD IMAGE #################### -FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev - -RUN apt-get update -y \ - && apt-get install -y python3-pip git - -WORKDIR /workspace - -# install build and runtime dependencies -COPY requirements.txt requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements.txt - -# install development dependencies -COPY requirements-dev.txt requirements-dev.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements-dev.txt -#################### BASE BUILD IMAGE #################### - - -#################### EXTENSION BUILD IMAGE #################### -FROM dev AS build - -# install build dependencies -COPY requirements-build.txt requirements-build.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements-build.txt - -# copy input files -COPY csrc csrc -COPY setup.py setup.py -COPY requirements.txt requirements.txt -COPY pyproject.toml pyproject.toml -COPY vllm/__init__.py vllm/__init__.py - -# cuda arch list used by torch -ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX' -ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} -# max jobs used by Ninja to build extensions -ARG max_jobs=2 -ENV MAX_JOBS=${max_jobs} -# number of threads used by nvcc -ARG nvcc_threads=8 -ENV NVCC_THREADS=$nvcc_threads -# make sure punica kernels are built (for LoRA) -ENV VLLM_INSTALL_PUNICA_KERNELS=1 - -RUN python3 setup.py build_ext --inplace -#################### EXTENSION Build IMAGE #################### - - -#################### TEST IMAGE #################### -# image to run unit testing suite -FROM dev AS test - -# copy pytorch extensions separately to avoid having to rebuild -# when python code changes -WORKDIR /vllm-workspace -# ADD is used to preserve directory structure -ADD . /vllm-workspace/ -COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/ -# ignore build dependencies installation because we are using pre-complied extensions -RUN rm pyproject.toml -RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose -#################### TEST IMAGE #################### - - -#################### RUNTIME BASE IMAGE #################### -# use CUDA base as CUDA runtime dependencies are already installed via pip -FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base - -# libnccl required for ray -RUN apt-get update -y \ - && apt-get install -y python3-pip - -WORKDIR /workspace -COPY requirements.txt requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -r requirements.txt -#################### RUNTIME BASE IMAGE #################### - - -#################### OPENAI API SERVER #################### -# openai api server alternative -FROM vllm-base AS vllm-openai -# install additional dependencies for openai api server -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate - -# Create a non-root user -RUN useradd -m appuser - -# Transfer ownership of the /workspace to the new non-root user -RUN chown -R appuser:appuser /workspace - -# Create a cache directory within the appuser's home directory and transfer ownership -RUN mkdir -p /home/appuser/cache && \ - chown -R appuser:appuser /home/appuser/cache - -# Switch to the non-root user for subsequent commands and container runtime -USER appuser - -# Set the Hugging Face cache directory environment variable -ENV TRANSFORMERS_CACHE=/home/appuser/cache - -COPY --from=build /workspace/vllm/*.so /workspace/vllm/ -COPY vllm vllm - -CMD ["python3", "-m", \ - "vllm.entrypoints.openai.api_server", \ - "--host", "0.0.0.0", \ - "--port", "7860", \ - "--served-model-name", "default", \ - "--model", "facebook/opt-125m", \ - "--max-model-len", "1024", \ - "--tensor-parallel-size", "1", \ - "--max-num-seqs", "16"] -#################### OPENAI API SERVER #################### diff --git a/Dockerfile.rocm b/Dockerfile.rocm deleted file mode 100644 index 88172fb73b937828a111492997ad96b66cad403c..0000000000000000000000000000000000000000 --- a/Dockerfile.rocm +++ /dev/null @@ -1,88 +0,0 @@ -# default base image -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -FROM $BASE_IMAGE - -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -RUN echo "Base image is $BASE_IMAGE" - -# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" -# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -# this does not always work for all rocm versions -RUN LLVM_GFX_ARCH=$(/opt/rocm/llvm/bin/amdgpu-offload-arch) && \ - echo "LLVM_GFX_ARCH is $LLVM_GFX_ARCH" - -ARG FA_GFX_ARCHS="gfx90a;gfx942" -RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" - -ARG FA_BRANCH="3d2b6f5" -RUN echo "FA_BRANCH is $FA_BRANCH" - -# Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y - -# Install some basic utilities -RUN apt-get update && apt-get install -y \ - curl \ - ca-certificates \ - sudo \ - git \ - bzip2 \ - libx11-6 \ - build-essential \ - wget \ - unzip \ - nvidia-cuda-toolkit \ - tmux \ - && rm -rf /var/lib/apt/lists/* - -### Mount Point ### -# When launching the container, mount the code directory to /app -ARG APP_MOUNT=/app -VOLUME [ ${APP_MOUNT} ] -WORKDIR ${APP_MOUNT} - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas - -ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer -ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: -ENV 
CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: - -# Install ROCm flash-attention -RUN mkdir libs \ - && cd libs \ - && git clone https://github.com/ROCmSoftwarePlatform/flash-attention.git \ - && cd flash-attention \ - && git checkout ${FA_BRANCH} \ - && git submodule update --init \ - && export GPU_ARCHS=${FA_GFX_ARCHS} \ - && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \ - patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ - && python3 setup.py install \ - && cd .. - -COPY ./ /app/vllm - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install xformers==0.0.23 --no-deps - -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. -# Manually removed it so that later steps of numpy upgrade can continue -RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi - -RUN cd /app \ - && cd vllm \ - && pip install -U -r requirements-rocm.txt \ - && bash patch_xformers.rocm.sh \ - && python3 setup.py install \ - && cd .. - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir ray[all] - -CMD ["/bin/bash"] diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64..0000000000000000000000000000000000000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. 
For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 0c897cf147f109d6a452905acfd006934fa495dc..0000000000000000000000000000000000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include LICENSE -include requirements.txt - -recursive-include csrc * diff --git a/README.md b/README.md deleted file mode 100644 index d4dcd578545cac6572939d6210d4931bba2549fa..0000000000000000000000000000000000000000 --- a/README.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: CertifAIer Demo -emoji: ✈️ -colorFrom: blue -colorTo: blue -sdk: docker -pinned: false ---- \ No newline at end of file diff --git a/benchmarks/README.md b/benchmarks/README.md deleted file mode 100644 index 192d6c4022c839f4d8c459d572f8fa23b6ac0968..0000000000000000000000000000000000000000 --- a/benchmarks/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# Benchmarking vLLM - -## Downloading the ShareGPT dataset - -You can download the dataset by running: -```bash -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -``` diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py deleted file mode 100644 index 7173134358762999dd60303842018698ef554b5a..0000000000000000000000000000000000000000 --- a/benchmarks/benchmark_latency.py +++ /dev/null @@ -1,139 +0,0 @@ -"""Benchmark the latency of processing a single batch of requests.""" -import argparse -import time -from pathlib import Path -from typing import Optional - -import numpy as np -import torch -from tqdm import tqdm - -from vllm import LLM, SamplingParams - - -def main(args: argparse.Namespace): - print(args) - - # NOTE(woosuk): If the request cannot be processed in a single batch, - # the engine will automatically process the request in multiple batches. - llm = LLM( - model=args.model, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - ) - - sampling_params = SamplingParams( - n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, - top_p=1.0, - use_beam_search=args.use_beam_search, - ignore_eos=True, - max_tokens=args.output_len, - ) - print(sampling_params) - dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size - - def run_to_completion(profile_dir: Optional[str] = None): - if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir))) as p: - llm.generate(prompt_token_ids=dummy_prompt_token_ids, - sampling_params=sampling_params, - use_tqdm=False) - print(p.key_averages()) - else: - start_time = time.perf_counter() - llm.generate(prompt_token_ids=dummy_prompt_token_ids, - sampling_params=sampling_params, - use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - return latency - - print("Warming up...") - run_to_completion(profile_dir=None) - - if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = Path( - "." - ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=args.profile_result_dir) - return - - # Benchmark. 
- latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) - print(f'Avg latency: {np.mean(latencies)} seconds') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Benchmark the latency of processing a single batch of ' - 'requests till completion.') - parser.add_argument('--model', type=str, default='facebook/opt-125m') - parser.add_argument('--tokenizer', type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=['awq', 'gptq', 'squeezellm', None], - default=None) - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) - parser.add_argument('--input-len', type=int, default=32) - parser.add_argument('--output-len', type=int, default=128) - parser.add_argument('--batch-size', type=int, default=8) - parser.add_argument('--n', - type=int, - default=1, - help='Number of generated sequences per prompt.') - parser.add_argument('--use-beam-search', action='store_true') - parser.add_argument('--num-iters', - type=int, - default=3, - help='Number of iterations to run.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--enforce-eager', - action='store_true', - help='enforce eager mode and disable CUDA graph') - parser.add_argument( - "--kv-cache-dtype", - type=str, - choices=['auto', 'fp8_e5m2'], - default='auto', - help= - 'Data type for kv cache storage. If "auto", will use model data type.') - parser.add_argument( - '--profile', - action='store_true', - help='profile the generation process of a single batch') - parser.add_argument( - '--profile-result-dir', - type=str, - default=None, - help=('path to save the pytorch profiler output. Can be visualized ' - 'with ui.perfetto.dev or Tensorboard.')) - args = parser.parse_args() - main(args) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py deleted file mode 100644 index 1a36d9d6a5debc18e75214c1ee4fd47d457f1a3e..0000000000000000000000000000000000000000 --- a/benchmarks/benchmark_serving.py +++ /dev/null @@ -1,249 +0,0 @@ -"""Benchmark online serving throughput. - -On the server side, run one of the following commands: - (vLLM backend) - python -m vllm.entrypoints.api_server \ - --model <your_model> --swap-space 16 \ - --disable-log-requests - - (TGI backend) - ./launch_hf_server.sh <your_model> - -On the client side, run: - python benchmarks/benchmark_serving.py \ - --backend <backend> \ - --tokenizer <your_model> --dataset <target_dataset> \ - --request-rate <request_rate> -""" -import argparse -import asyncio -import json -import random -import time -from typing import AsyncGenerator, List, Tuple - -import aiohttp -import numpy as np -from tqdm.asyncio import tqdm -from transformers import PreTrainedTokenizerBase -from vllm.transformers_utils.tokenizer import get_tokenizer - -# (prompt len, output len, latency) -REQUEST_LATENCY: List[Tuple[int, int, float]] = [] - - -def sample_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, -) -> List[Tuple[str, int, int]]: - # Load the dataset. 
- with open(dataset_path) as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - - # Tokenize the prompts and completions. - prompts = [prompt for prompt, _ in dataset] - prompt_token_ids = tokenizer(prompts).input_ids - completions = [completion for _, completion in dataset] - completion_token_ids = tokenizer(completions).input_ids - tokenized_dataset = [] - for i in range(len(dataset)): - output_len = len(completion_token_ids[i]) - tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) - - # Filter out too long sequences. - filtered_dataset: List[Tuple[str, int, int]] = [] - for prompt, prompt_token_ids, output_len in tokenized_dataset: - prompt_len = len(prompt_token_ids) - if prompt_len < 4 or output_len < 4: - # Prune too short sequences. - # This is because TGI causes errors when the input or output length - # is too short. - continue - if prompt_len > 1024 or prompt_len + output_len > 2048: - # Prune too long sequences. - continue - filtered_dataset.append((prompt, prompt_len, output_len)) - - # Sample the requests. - sampled_requests = random.sample(filtered_dataset, num_requests) - return sampled_requests - - -async def get_request( - input_requests: List[Tuple[str, int, int]], - request_rate: float, -) -> AsyncGenerator[Tuple[str, int, int], None]: - input_requests = iter(input_requests) - for request in input_requests: - yield request - - if request_rate == float("inf"): - # If the request rate is infinity, then we don't need to wait. - continue - # Sample the request interval from the exponential distribution. - interval = np.random.exponential(1.0 / request_rate) - # The next request will be sent after the interval. - await asyncio.sleep(interval) - - -async def send_request(backend: str, model: str, api_url: str, prompt: str, - prompt_len: int, output_len: int, best_of: int, - use_beam_search: bool, pbar: tqdm) -> None: - request_start_time = time.perf_counter() - - headers = {"User-Agent": "Benchmark Client"} - if backend == "vllm": - pload = { - "prompt": prompt, - "n": 1, - "best_of": best_of, - "use_beam_search": use_beam_search, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "max_tokens": output_len, - "ignore_eos": True, - "stream": False, - } - if model is not None: - pload["model"] = model - elif backend == "tgi": - assert not use_beam_search - params = { - "best_of": best_of, - "max_new_tokens": output_len, - "do_sample": True, - } - pload = { - "inputs": prompt, - "parameters": params, - } - else: - raise ValueError(f"Unknown backend: {backend}") - - timeout = aiohttp.ClientTimeout(total=3 * 3600) - async with aiohttp.ClientSession(timeout=timeout) as session: - while True: - async with session.post(api_url, headers=headers, - json=pload) as response: - chunks = [] - async for chunk, _ in response.content.iter_chunks(): - chunks.append(chunk) - output = b"".join(chunks).decode("utf-8") - output = json.loads(output) - - # Re-send the request if it failed. 
- if "error" not in output: - break - - request_end_time = time.perf_counter() - request_latency = request_end_time - request_start_time - REQUEST_LATENCY.append((prompt_len, output_len, request_latency)) - pbar.update(1) - - -async def benchmark( - backend: str, - model: str, - api_url: str, - input_requests: List[Tuple[str, int, int]], - best_of: int, - use_beam_search: bool, - request_rate: float, -) -> None: - tasks: List[asyncio.Task] = [] - pbar = tqdm(total=len(input_requests)) - async for request in get_request(input_requests, request_rate): - prompt, prompt_len, output_len = request - task = asyncio.create_task( - send_request(backend, model, api_url, prompt, prompt_len, - output_len, best_of, use_beam_search, pbar)) - tasks.append(task) - await asyncio.gather(*tasks) - pbar.close() - - -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - np.random.seed(args.seed) - - api_url = f"{args.protocol}://{args.host}:{args.port}{args.endpoint}" - tokenizer = get_tokenizer(args.tokenizer, - trust_remote_code=args.trust_remote_code) - input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer) - - benchmark_start_time = time.perf_counter() - asyncio.run( - benchmark(args.backend, args.model, api_url, input_requests, - args.best_of, args.use_beam_search, args.request_rate)) - benchmark_end_time = time.perf_counter() - benchmark_time = benchmark_end_time - benchmark_start_time - print(f"Total time: {benchmark_time:.2f} s") - print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s") - - # Compute the latency statistics. - avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY]) - print(f"Average latency: {avg_latency:.2f} s") - avg_per_token_latency = np.mean([ - latency / (prompt_len + output_len) - for prompt_len, output_len, latency in REQUEST_LATENCY - ]) - print(f"Average latency per token: {avg_per_token_latency:.2f} s") - avg_per_output_token_latency = np.mean( - [latency / output_len for _, output_len, latency in REQUEST_LATENCY]) - print("Average latency per output token: " - f"{avg_per_output_token_latency:.2f} s") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Benchmark the online serving throughput.") - parser.add_argument("--backend", - type=str, - default="vllm", - choices=["vllm", "tgi"]) - parser.add_argument("--protocol", - type=str, - default="http", - choices=["http", "https"]) - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--endpoint", type=str, default="/generate") - parser.add_argument("--model", type=str, default=None) - parser.add_argument("--dataset", - type=str, - required=True, - help="Path to the dataset.") - parser.add_argument("--tokenizer", - type=str, - required=True, - help="Name or path of the tokenizer.") - parser.add_argument("--best-of", - type=int, - default=1, - help="Generates `best_of` sequences per prompt and " - "returns the best one.") - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument("--num-prompts", - type=int, - default=1000, - help="Number of prompts to process.") - parser.add_argument("--request-rate", - type=float, - default=float("inf"), - help="Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. 
" - "Otherwise, we use Poisson process to synthesize " - "the request arrival times.") - parser.add_argument("--seed", type=int, default=0) - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - args = parser.parse_args() - main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py deleted file mode 100644 index d45d33307c9124c365e73c30a62771ce5082d2e4..0000000000000000000000000000000000000000 --- a/benchmarks/benchmark_throughput.py +++ /dev/null @@ -1,328 +0,0 @@ -"""Benchmark offline inference throughput.""" -import argparse -import json -import random -import time -from typing import List, Optional, Tuple - -import torch -from transformers import (AutoModelForCausalLM, AutoTokenizer, - PreTrainedTokenizerBase) -from tqdm import tqdm - - -def sample_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: - if fixed_output_len is not None and fixed_output_len < 4: - raise ValueError("output_len too small") - - # Load the dataset. - with open(dataset_path) as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - - # Tokenize the prompts and completions. - prompts = [prompt for prompt, _ in dataset] - prompt_token_ids = tokenizer(prompts).input_ids - completions = [completion for _, completion in dataset] - completion_token_ids = tokenizer(completions).input_ids - tokenized_dataset = [] - for i in range(len(dataset)): - output_len = len(completion_token_ids[i]) - if fixed_output_len is not None: - output_len = fixed_output_len - tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) - - # Filter out too long sequences. - filtered_dataset: List[Tuple[str, int, int]] = [] - for prompt, prompt_token_ids, output_len in tokenized_dataset: - prompt_len = len(prompt_token_ids) - if prompt_len < 4 or output_len < 4: - # Prune too short sequences. - continue - if prompt_len > 1024 or prompt_len + output_len > 2048: - # Prune too long sequences. - continue - filtered_dataset.append((prompt, prompt_len, output_len)) - - # Sample the requests. - sampled_requests = random.sample(filtered_dataset, num_requests) - return sampled_requests - - -def run_vllm( - requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, - n: int, - use_beam_search: bool, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, -) -> float: - from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - ) - - # Add the requests to the engine. - for prompt, _, output_len in requests: - sampling_params = SamplingParams( - n=n, - temperature=0.0 if use_beam_search else 1.0, - top_p=1.0, - use_beam_search=use_beam_search, - ignore_eos=True, - max_tokens=output_len, - ) - # FIXME(woosuk): Do not use internal method. 
- llm._add_request( - prompt=prompt, - prompt_token_ids=None, - sampling_params=sampling_params, - ) - - start = time.perf_counter() - # FIXME(woosuk): Do not use internal method. - llm._run_engine(use_tqdm=True) - end = time.perf_counter() - return end - start - - -def run_hf( - requests: List[Tuple[str, int, int]], - model: str, - tokenizer: PreTrainedTokenizerBase, - n: int, - use_beam_search: bool, - max_batch_size: int, - trust_remote_code: bool, -) -> float: - assert not use_beam_search - llm = AutoModelForCausalLM.from_pretrained( - model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code) - if llm.config.model_type == "llama": - # To enable padding in the HF backend. - tokenizer.pad_token = tokenizer.eos_token - llm = llm.cuda() - - pbar = tqdm(total=len(requests)) - start = time.perf_counter() - batch: List[str] = [] - max_prompt_len = 0 - max_output_len = 0 - for i in range(len(requests)): - prompt, prompt_len, output_len = requests[i] - # Add the prompt to the batch. - batch.append(prompt) - max_prompt_len = max(max_prompt_len, prompt_len) - max_output_len = max(max_output_len, output_len) - if len(batch) < max_batch_size and i != len(requests) - 1: - # Check if we can add more requests to the batch. - _, next_prompt_len, next_output_len = requests[i + 1] - if (max(max_prompt_len, next_prompt_len) + - max(max_output_len, next_output_len)) <= 2048: - # We can add more requests to the batch. - continue - - # Generate the sequences. - input_ids = tokenizer(batch, return_tensors="pt", - padding=True).input_ids - llm_outputs = llm.generate( - input_ids=input_ids.cuda(), - do_sample=not use_beam_search, - num_return_sequences=n, - temperature=1.0, - top_p=1.0, - use_cache=True, - max_new_tokens=max_output_len, - ) - # Include the decoding time. - tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) - pbar.update(len(batch)) - - # Clear the batch. - batch = [] - max_prompt_len = 0 - max_output_len = 0 - end = time.perf_counter() - return end - start - - -def run_mii( - requests: List[Tuple[str, int, int]], - model: str, - tensor_parallel_size: int, - output_len: int, -) -> float: - from mii import pipeline - llm = pipeline(model, tensor_parallel=tensor_parallel_size) - prompts = [prompt for prompt, _, _ in requests] - - start = time.perf_counter() - llm(prompts, max_new_tokens=output_len) - end = time.perf_counter() - return end - start - - -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - - # Sample the requests. - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code) - if args.dataset is None: - # Synthesize a prompt with the given input length. 
- prompt = "hi" * (args.input_len - 1) - requests = [(prompt, args.input_len, args.output_len) - for _ in range(args.num_prompts)] - else: - requests = sample_requests(args.dataset, args.num_prompts, tokenizer, - args.output_len) - - if args.backend == "vllm": - elapsed_time = run_vllm(requests, args.model, args.tokenizer, - args.quantization, args.tensor_parallel_size, - args.seed, args.n, args.use_beam_search, - args.trust_remote_code, args.dtype, - args.max_model_len, args.enforce_eager, - args.kv_cache_dtype) - elif args.backend == "hf": - assert args.tensor_parallel_size == 1 - elapsed_time = run_hf(requests, args.model, tokenizer, args.n, - args.use_beam_search, args.hf_max_batch_size, - args.trust_remote_code) - elif args.backend == "mii": - elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, - args.output_len) - else: - raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum(prompt_len + output_len - for _, prompt_len, output_len in requests) - print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} tokens/s") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark the throughput.") - parser.add_argument("--backend", - type=str, - choices=["vllm", "hf", "mii"], - default="vllm") - parser.add_argument("--dataset", - type=str, - default=None, - help="Path to the dataset.") - parser.add_argument("--input-len", - type=int, - default=None, - help="Input prompt length for each request") - parser.add_argument("--output-len", - type=int, - default=None, - help="Output length for each request. Overrides the " - "output length from the dataset.") - parser.add_argument("--model", type=str, default="facebook/opt-125m") - parser.add_argument("--tokenizer", type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=['awq', 'gptq', 'squeezellm', None], - default=None) - parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) - parser.add_argument("--n", - type=int, - default=1, - help="Number of generated sequences per prompt.") - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument("--num-prompts", - type=int, - default=1000, - help="Number of prompts to process.") - parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--hf-max-batch-size", - type=int, - default=None, - help="Maximum batch size for HF backend.") - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument("--enforce-eager", - action="store_true", - help="enforce eager execution") - parser.add_argument( - "--kv-cache-dtype", - type=str, - choices=["auto", "fp8_e5m2"], - default="auto", - help= - 'Data type for kv cache storage. 
If "auto", will use model data type.') - args = parser.parse_args() - if args.tokenizer is None: - args.tokenizer = args.model - if args.dataset is None: - assert args.input_len is not None - assert args.output_len is not None - else: - assert args.input_len is None - - if args.backend == "vllm": - if args.hf_max_batch_size is not None: - raise ValueError("HF max batch size is only for HF backend.") - elif args.backend == "hf": - if args.hf_max_batch_size is None: - raise ValueError("HF max batch size is required for HF backend.") - if args.quantization is not None: - raise ValueError("Quantization is only for vLLM backend.") - elif args.backend == "mii": - if args.dtype != "auto": - raise ValueError("dtype must be auto for MII backend.") - if args.n != 1: - raise ValueError("n must be 1 for MII backend.") - if args.use_beam_search: - raise ValueError("Beam search is not supported for MII backend.") - if args.quantization is not None: - raise ValueError("Quantization is only for vLLM backend.") - if args.hf_max_batch_size is not None: - raise ValueError("HF max batch size is only for HF backend.") - if args.tokenizer != args.model: - raise ValueError("Tokenizer must be the same as the model for MII " - "backend.") - main(args) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py deleted file mode 100644 index 56fe1b921d44ecee919d10ae49feb3725ce3448d..0000000000000000000000000000000000000000 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ /dev/null @@ -1,196 +0,0 @@ -from typing import Optional -import argparse -import random -import time - -import torch - -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random -from vllm._C import ops - -NUM_BLOCKS = 1024 -PARTITION_SIZE = 512 - - -@torch.inference_mode() -def main( - version: str, - num_seqs: int, - context_len: int, - num_query_heads: int, - num_kv_heads: int, - head_size: int, - use_alibi: bool, - block_size: int, - dtype: torch.dtype, - seed: int, - do_profile: bool, - kv_cache_dtype: Optional[str] = None, -) -> None: - random.seed(seed) - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - - scale = float(1.0 / (head_size**0.5)) - query = torch.empty(num_seqs, - num_query_heads, - head_size, - dtype=dtype, - device="cuda") - query.uniform_(-scale, scale) - - assert num_query_heads % num_kv_heads == 0 - alibi_slopes = None - if use_alibi: - alibi_slopes = torch.randn(num_query_heads, - dtype=torch.float, - device="cuda") - - context_lens = [context_len for _ in range(num_seqs)] - max_context_len = max(context_lens) - context_lens = torch.tensor(context_lens, dtype=torch.int, device="cuda") - - # Create the block tables. - max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size - block_tables = [] - for _ in range(num_seqs): - block_table = [ - random.randint(0, NUM_BLOCKS - 1) - for _ in range(max_num_blocks_per_seq) - ] - block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int, device="cuda") - - # Create the KV cache. - key_caches, value_caches = create_kv_caches_with_random( - NUM_BLOCKS, block_size, 1, num_kv_heads, head_size, kv_cache_dtype, - dtype) - key_cache, value_cache = key_caches[0], value_caches[0] - - # Prepare for the paged attention kernel. 
- output = torch.empty_like(query) - if version == "v2": - num_partitions = ((max_context_len + PARTITION_SIZE - 1) // - PARTITION_SIZE) - tmp_output = torch.empty( - size=(num_seqs, num_query_heads, num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_query_heads, num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - - def run_benchmark(num_iters: int, profile: bool = False) -> float: - torch.cuda.synchronize() - if profile: - torch.cuda.cudart().cudaProfilerStart() - start_time = time.perf_counter() - - for _ in range(num_iters): - if version == "v1": - ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - context_lens, - block_size, - max_context_len, - alibi_slopes, - kv_cache_dtype, - ) - elif version == "v2": - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - context_lens, - block_size, - max_context_len, - alibi_slopes, - kv_cache_dtype, - ) - else: - raise ValueError(f"Invalid version: {version}") - torch.cuda.synchronize() - - end_time = time.perf_counter() - if profile: - torch.cuda.cudart().cudaProfilerStart() - return (end_time - start_time) / num_iters - - # Warmup. - print("Warming up...") - run_benchmark(num_iters=3, profile=False) - - # Benchmark. - if do_profile: - latency = run_benchmark(num_iters=1, profile=True) - else: - latency = run_benchmark(num_iters=100, profile=False) - print(f"Kernel running time: {latency * 1000000:.3f} us") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="Benchmark the paged attention kernel.") - parser.add_argument("--version", - type=str, - choices=["v1", "v2"], - default="v2") - parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument("--context-len", type=int, default=4096) - parser.add_argument("--num-query-heads", type=int, default=64) - parser.add_argument("--num-kv-heads", type=int, default=8) - parser.add_argument("--head-size", - type=int, - choices=[64, 80, 96, 112, 128, 256], - default=128) - parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) - parser.add_argument("--use-alibi", action="store_true") - parser.add_argument("--dtype", - type=str, - choices=["half", "bfloat16", "float"], - default="half") - parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--profile", action="store_true") - parser.add_argument( - "--kv-cache-dtype", - type=str, - choices=["auto", "fp8_e5m2"], - default="auto", - help= - 'Data type for kv cache storage. 
If "auto", will use model data type.') - args = parser.parse_args() - print(args) - - if args.num_query_heads % args.num_kv_heads != 0: - raise ValueError("num_query_heads must be divisible by num_kv_heads") - main( - version=args.version, - num_seqs=args.batch_size, - context_len=args.context_len, - num_query_heads=args.num_query_heads, - num_kv_heads=args.num_kv_heads, - head_size=args.head_size, - block_size=args.block_size, - use_alibi=args.use_alibi, - dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], - seed=args.seed, - do_profile=args.profile, - kv_cache_dtype=args.kv_cache_dtype, - ) diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh deleted file mode 100755 index bdb25b78d85b477877e61b4fb8f277714aa2a851..0000000000000000000000000000000000000000 --- a/benchmarks/launch_tgi_server.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -PORT=8000 -MODEL=$1 -TOKENS=$2 - -docker run --gpus all --shm-size 1g -p $PORT:80 \ - -v $PWD/data:/data \ - ghcr.io/huggingface/text-generation-inference:0.8 \ - --model-id $MODEL \ - --sharded false \ - --max-input-length 1024 \ - --max-total-tokens 2048 \ - --max-best-of 5 \ - --max-concurrent-requests 5000 \ - --max-batch-total-tokens $TOKENS diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu deleted file mode 100644 index 5ba9ab178d5a42a95c4ce622db14136de85b44ad..0000000000000000000000000000000000000000 --- a/csrc/activation_kernels.cu +++ /dev/null @@ -1,118 +0,0 @@ -#include <ATen/cuda/CUDAContext.h> -#include <torch/extension.h> -#include <c10/cuda/CUDAGuard.h> - -#include "cuda_compat.h" -#include "dispatch_utils.h" - -namespace vllm { - -template<typename T> -__device__ __forceinline__ T silu(const T& x) { - // x * sigmoid(x) - return (T) (((float) x) / (1.0f + expf((float) -x))); -} - -template<typename scalar_t> -__global__ void silu_and_mul_kernel( - scalar_t* __restrict__ out, // [..., d] - const scalar_t* __restrict__ input, // [..., 2, d] - const int d) { - const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); - const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); - out[token_idx * d + idx] = silu(x) * y; - } -} - -} // namespace vllm - -void silu_and_mul( - torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., 2 * d] -{ - int64_t num_tokens = input.numel() / input.size(-1); - int d = input.size(-1) / 2; - - dim3 grid(num_tokens); - dim3 block(std::min(d, 1024)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), - "silu_and_mul_kernel", - [&] { - vllm::silu_and_mul_kernel<scalar_t><<<grid, block, 0, stream>>>( - out.data_ptr<scalar_t>(), - input.data_ptr<scalar_t>(), - d); - }); -} - -namespace vllm { - -// Element-wise activation kernel template. -template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)> -__global__ void activation_kernel( - scalar_t* __restrict__ out, // [..., d] - const scalar_t* __restrict__ input, // [..., d] - const int d) { - const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]); - out[token_idx * d + idx] = ACT_FN(x); - } -} - -} // namespace vllm - -// Launch element-wise activation kernel. 
-#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ - int d = input.size(-1); \ - int64_t num_tokens = input.numel() / d; \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), \ - "activation_kernel", \ - [&] { \ - vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \ - out.data_ptr<scalar_t>(), \ - input.data_ptr<scalar_t>(), \ - d); \ - }); - -namespace vllm { - -template<typename T> -__device__ __forceinline__ T gelu_new_kernel(const T& x) { - const float x3 = (float) (x * x * x); - const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3)))); - return ((T) 0.5) * x * (((T) 1.0) + t); -} - -template<typename T> -__device__ __forceinline__ T gelu_fast_kernel(const T& x) { - const float f = (float) x; - const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x)); - return ((T) 0.5) * x * (((T) 1.0) + t); -} - -} // namespace vllm - -void gelu_new( - torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., d] -{ - LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel); -} - -void gelu_fast( - torch::Tensor& out, // [..., d] - torch::Tensor& input) // [..., d] -{ - LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); -} diff --git a/csrc/attention/attention_dtypes.h b/csrc/attention/attention_dtypes.h deleted file mode 100644 index 61748e6b1eee6ffab04d3da2b0da7da6546fef43..0000000000000000000000000000000000000000 --- a/csrc/attention/attention_dtypes.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -#include "attention_generic.cuh" -#include "dtype_float16.cuh" -#include "dtype_float32.cuh" -#include "dtype_bfloat16.cuh" -#include "dtype_fp8_e5m2.cuh" diff --git a/csrc/attention/attention_generic.cuh b/csrc/attention/attention_generic.cuh deleted file mode 100644 index 31fb401cbe2c158bb7e3dfad266e794e5da58abc..0000000000000000000000000000000000000000 --- a/csrc/attention/attention_generic.cuh +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h - * Copyright (c) 2023, The vLLM team. - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include <stdint.h> - -namespace vllm { - -// A vector type to store Q, K, V elements. -template<typename T, int VEC_SIZE> -struct Vec {}; - -// A vector type to store FP32 accumulators. -template<typename T> -struct FloatVec {}; - -// Template vector operations. 
-template<typename Acc, typename A, typename B> -inline __device__ Acc mul(A a, B b); - -template<typename T> -inline __device__ float sum(T v); - -template<typename T> -inline __device__ float dot(T a, T b) { - return sum(mul<T, T, T>(a, b)); -} - -template<typename A, typename T> -inline __device__ float dot(T a, T b) { - return sum(mul<A, T, T>(a, b)); -} - -template<typename T> -inline __device__ void zero(T& dst) { - constexpr int WORDS = sizeof(T) / 4; - union { - T raw; - uint32_t words[WORDS]; - } tmp; - -#pragma unroll - for (int ii = 0; ii < WORDS; ++ii) { - tmp.words[ii] = 0u; - } - dst = tmp.raw; -} - -} // namespace vllm diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu deleted file mode 100644 index a5ddeac74044004f9a728cbeb57dd339175ebf44..0000000000000000000000000000000000000000 --- a/csrc/attention/attention_kernels.cu +++ /dev/null @@ -1,951 +0,0 @@ -/* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp - * Copyright (c) 2023, The vLLM team. - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifdef USE_ROCM -#include <hip/hip_runtime.h> -#endif - -#include <torch/extension.h> -#include <ATen/cuda/CUDAContext.h> -#include <c10/cuda/CUDAGuard.h> - -#include "attention_dtypes.h" -#include "attention_utils.cuh" -#include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh" - -#include <algorithm> - -#ifndef USE_ROCM -#define WARP_SIZE 32 -#else -#define WARP_SIZE warpSize -#endif -#define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) - -namespace vllm { - -// Utility function for attention softmax. -template<int NUM_WARPS> -inline __device__ float block_sum(float* red_smem, float sum) { - // Decompose the thread index into warp / lane. - int warp = threadIdx.x / WARP_SIZE; - int lane = threadIdx.x % WARP_SIZE; - - // Compute the sum per warp. -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { - sum += VLLM_SHFL_XOR_SYNC(sum, mask); - } - - // Warp leaders store the data to shared memory. - if (lane == 0) { - red_smem[warp] = sum; - } - - // Make sure the data is in shared memory. - __syncthreads(); - - // The warps compute the final sums. - if (lane < NUM_WARPS) { - sum = red_smem[lane]; - } - - // Parallel reduction inside the warp. -#pragma unroll - for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { - sum += VLLM_SHFL_XOR_SYNC(sum, mask); - } - - // Broadcast to other threads. - return VLLM_SHFL_SYNC(sum, 0); -} - -// TODO(woosuk): Merge the last two dimensions of the grid. -// Grid: (num_heads, num_seqs, max_num_partitions). 
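The `block_sum` helper deleted above reduces one float per thread to a block-wide sum in two stages: a butterfly (XOR-shuffle) reduction inside each warp, then a reduction over the per-warp sums staged through shared memory. A host-side simulation of that data flow, with illustrative sizes (a sketch of the pattern, not the CUDA implementation):

```cpp
#include <cstdio>
#include <vector>

// Simulation of the block_sum pattern: each "thread" holds one partial value;
// warps reduce internally with a butterfly (XOR) exchange, warp leaders publish
// their sums, and the per-warp sums are reduced once more.
constexpr int WARP_SIZE = 32;
constexpr int NUM_THREADS = 128;
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;

float block_sum_sim(std::vector<float> v) {            // v.size() == NUM_THREADS
  // 1) Intra-warp butterfly: lane t accumulates lane t^mask (same warp, mask < WARP_SIZE).
  for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
    std::vector<float> next = v;
    for (int t = 0; t < NUM_THREADS; ++t) next[t] = v[t] + v[t ^ mask];
    v = next;
  }
  // 2) Warp leaders (lane 0) publish their warp's sum.
  float red_smem[NUM_WARPS];
  for (int w = 0; w < NUM_WARPS; ++w) red_smem[w] = v[w * WARP_SIZE];
  // 3) Reduce the per-warp sums (the kernel does this with a second butterfly).
  float sum = 0.f;
  for (int w = 0; w < NUM_WARPS; ++w) sum += red_smem[w];
  return sum;                                          // broadcast to all threads on the GPU
}

int main() {
  std::vector<float> v(NUM_THREADS, 1.0f);
  std::printf("block sum = %.1f\n", block_sum_sim(v)); // 128.0
}
```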
-template< - typename scalar_t, - typename cache_t, - int HEAD_SIZE, - int BLOCK_SIZE, - int NUM_THREADS, - bool IS_FP8_E5M2_KV_CACHE, - int PARTITION_SIZE = 0> // Zero means no partitioning. -__device__ void paged_attention_kernel( - float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] - float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] - scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - const int num_kv_heads, // [num_heads] - const float scale, - const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] - const int max_num_blocks_per_seq, - const float* __restrict__ alibi_slopes, // [num_heads] - const int q_stride, - const int kv_block_stride, - const int kv_head_stride) { - const int seq_idx = blockIdx.y; - const int partition_idx = blockIdx.z; - const int max_num_partitions = gridDim.z; - constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0; - const int context_len = context_lens[seq_idx]; - if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) { - // No work to do. Terminate the thread block. - return; - } - - const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE); - const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks; - - // [start_block_idx, end_block_idx) is the range of blocks to process. - const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0; - const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_context_blocks); - const int num_blocks = end_block_idx - start_block_idx; - - // [start_token_idx, end_token_idx) is the range of tokens to process. - const int start_token_idx = start_block_idx * BLOCK_SIZE; - const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len); - const int num_tokens = end_token_idx - start_token_idx; - - constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1); - constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS - assert(NUM_THREADS % THREAD_GROUP_SIZE == 0); - constexpr int NUM_TOKENS_PER_THREAD_GROUP = DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE); - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - const int thread_idx = threadIdx.x; - const int warp_idx = thread_idx / WARP_SIZE; - const int lane = thread_idx % WARP_SIZE; - - const int head_idx = blockIdx.x; - const int num_heads = gridDim.x; - const int num_queries_per_kv = num_heads / num_kv_heads; - const int kv_head_idx = head_idx / num_queries_per_kv; - const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx]; - - // A vector type to store a part of a key or a query. - // The vector size is configured in such a way that the threads in a thread group - // fetch or compute 16 bytes at a time. - // For example, if the size of a thread group is 4 and the data type is half, - // then the vector size is 16 / (4 * sizeof(half)) == 2. 
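To make the sizing comment above concrete, here is the same arithmetic worked through for an illustrative FP16 configuration (HEAD_SIZE = 128, BLOCK_SIZE = 16, WARP_SIZE = 32; these numbers are examples, not defaults taken from the sources):

```cpp
#include <cstdio>

// Worked example of the thread-group and vector sizing used by the kernel above.
int main() {
  const int WARP_SIZE = 32, BLOCK_SIZE = 16, HEAD_SIZE = 128;
  const int sizeof_scalar = 2;  // FP16

  // One thread group cooperates on one token of a key block.
  const int THREAD_GROUP_SIZE = WARP_SIZE / BLOCK_SIZE > 1 ? WARP_SIZE / BLOCK_SIZE : 1; // 2
  // The group fetches 16 bytes per step, split across its threads.
  const int VEC_SIZE = 16 / (THREAD_GROUP_SIZE * sizeof_scalar);                          // 4 halves per thread
  const int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE;                         // 64
  const int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE;                        // 16

  std::printf("group=%d vec=%d elems/thread=%d vecs/thread=%d\n",
              THREAD_GROUP_SIZE, VEC_SIZE, NUM_ELEMS_PER_THREAD, NUM_VECS_PER_THREAD);
}
```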
- constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1); - using K_vec = typename Vec<scalar_t, VEC_SIZE>::Type; - using Q_vec = typename Vec<scalar_t, VEC_SIZE>::Type; -#ifdef ENABLE_FP8_E5M2 - using Quant_vec = typename Vec<cache_t, VEC_SIZE>::Type; -#endif - - constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE; - constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE; - - const int thread_group_idx = thread_idx / THREAD_GROUP_SIZE; - const int thread_group_offset = thread_idx % THREAD_GROUP_SIZE; - - // Load the query to registers. - // Each thread in a thread group has a different part of the query. - // For example, if the the thread group size is 4, then the first thread in the group - // has 0, 4, 8, ... th vectors of the query, and the second thread has 1, 5, 9, ... - // th vectors of the query, and so on. - // NOTE(woosuk): Because q is split from a qkv tensor, it may not be contiguous. - const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; - __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; -#pragma unroll - for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; i += NUM_THREAD_GROUPS) { - const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE; - q_vecs[thread_group_offset][i] = *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE); - } - __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a memory wall right before we use q_vecs - - // Memory planning. - extern __shared__ char shared_mem[]; - // NOTE(woosuk): We use FP32 for the softmax logits for better accuracy. - float* logits = reinterpret_cast<float*>(shared_mem); - // Workspace for reduction. - __shared__ float red_smem[2 * NUM_WARPS]; - - // x == THREAD_GROUP_SIZE * VEC_SIZE - // Each thread group fetches x elements from the key at a time. - constexpr int x = 16 / sizeof(cache_t); - float qk_max = -FLT_MAX; - - // Iterate over the key blocks. - // Each warp fetches a block of keys for each iteration. - // Each thread group in a warp fetches a key from the block, and computes - // dot product with the query. - const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq; - for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) { - // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64 - // because int32 can lead to overflow when this variable is multiplied by large numbers - // (e.g., kv_block_stride). - const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]); - - // Load a key to registers. - // Each thread in a thread group has a different part of the key. - // For example, if the the thread group size is 4, then the first thread in the group - // has 0, 4, 8, ... th vectors of the key, and the second thread has 1, 5, 9, ... th - // vectors of the key, and so on. 
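The key loads described above address a cache laid out as [num_blocks, num_kv_heads, head_size/x, block_size, x], where x is the number of elements in a 16-byte group. As a reference (the helper name and sizes are illustrative), the flat element index of a key element under that layout, mirroring the offset1/offset2 split the loop below performs:

```cpp
#include <cstdio>

// Illustrative index math for the paged key-cache layout
// [num_blocks, num_kv_heads, head_size/x, block_size, x] with x = 16/sizeof(cache_t).
int main() {
  const int num_kv_heads = 8, head_size = 128, block_size = 16;
  const int x = 16 / 2;  // FP16 cache -> 8 elements per 16-byte group

  // Flat element index of key element (block, head, head-dim e, token_in_block).
  auto key_index = [&](long block, int head, int e, int token) {
    const long kv_head_stride  = (long)(head_size / x) * block_size * x;  // elements per head
    const long kv_block_stride = (long)num_kv_heads * kv_head_stride;     // elements per block
    return block * kv_block_stride + head * kv_head_stride
         + (long)(e / x) * block_size * x   // offset1: which 16-byte group of the head dim
         + (long)token * x                  // the token's slot inside that group row
         + (e % x);                         // offset2: position inside the group
  };

  std::printf("idx(block=3, head=1, e=37, token=5) = %ld\n", key_index(3, 1, 37, 5));
}
```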
- for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) { - const int physical_block_offset = (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE; - const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; - K_vec k_vecs[NUM_VECS_PER_THREAD]; - -#pragma unroll - for (int j = 0; j < NUM_VECS_PER_THREAD; j++) { - const cache_t* k_ptr = k_cache + physical_block_number * kv_block_stride - + kv_head_idx * kv_head_stride - + physical_block_offset * x; - const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE; - const int offset1 = (vec_idx * VEC_SIZE) / x; - const int offset2 = (vec_idx * VEC_SIZE) % x; - if constexpr (IS_FP8_E5M2_KV_CACHE) { -#ifdef ENABLE_FP8_E5M2 - Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2); - // Vector conversion from Quant_vec to K_vec. - k_vecs[j] = fp8_e5m2_unscaled::vec_conversion<K_vec, Quant_vec>(k_vec_quant); -#else - assert(false); -#endif - } else { - k_vecs[j] = *reinterpret_cast<const K_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2); - } - } - - // Compute dot product. - // This includes a reduction across the threads in the same thread group. - float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs); - // Add the ALiBi bias if slopes are given. - qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0; - - if (thread_group_offset == 0) { - // Store the partial reductions to shared memory. - // NOTE(woosuk): It is required to zero out the masked logits. - const bool mask = token_idx >= context_len; - logits[token_idx - start_token_idx] = mask ? 0.f : qk; - // Update the max value. - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - } - } - } - - // Perform reduction across the threads in the same warp to get the - // max qk value for each "warp" (not across the thread block yet). - // The 0-th thread of each thread group already has its max qk value. -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { - qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); - } - if (lane == 0) { - red_smem[warp_idx] = qk_max; - } - __syncthreads(); - - // TODO(woosuk): Refactor this part. - // Get the max qk value for the sequence. - qk_max = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); - } - // Broadcast the max qk value to all threads. - qk_max = VLLM_SHFL_SYNC(qk_max, 0); - - // Get the sum of the exp values. - float exp_sum = 0.f; - for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { - float val = __expf(logits[i] - qk_max); - logits[i] = val; - exp_sum += val; - } - exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum); - - // Compute softmax. - const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); - for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { - logits[i] *= inv_sum; - } - __syncthreads(); - - // If partitioning is enabled, store the max logit and exp_sum. - if (USE_PARTITIONING && thread_idx == 0) { - float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions - + head_idx * max_num_partitions - + partition_idx; - *max_logits_ptr = qk_max; - float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions - + head_idx * max_num_partitions - + partition_idx; - *exp_sums_ptr = exp_sum; - } - - // Each thread will fetch 16 bytes from the value cache at a time. 
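The softmax above subtracts the partition's maximum logit before exponentiating and keeps (max_logit, exp_sum) around so a later reduce step can merge partitions. A host-side sketch of that bookkeeping (struct and function names are illustrative):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Numerically stable softmax over one partition's logits, returning the
// (max_logit, exp_sum) pair the kernel stores when partitioning is enabled.
struct PartitionStats { float max_logit; float exp_sum; };

PartitionStats softmax_in_place(std::vector<float>& logits) {
  float max_logit = -INFINITY;
  for (float l : logits) max_logit = std::fmax(max_logit, l);
  float exp_sum = 0.f;
  for (float& l : logits) { l = std::exp(l - max_logit); exp_sum += l; }
  const float inv_sum = 1.f / (exp_sum + 1e-6f);   // same epsilon as the kernel
  for (float& l : logits) l *= inv_sum;
  return {max_logit, exp_sum};
}

int main() {
  std::vector<float> logits = {1.0f, 2.0f, 3.0f};
  PartitionStats s = softmax_in_place(logits);
  std::printf("max=%.3f exp_sum=%.3f probs=%.3f %.3f %.3f\n",
              s.max_logit, s.exp_sum, logits[0], logits[1], logits[2]);
}
```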
- constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE); - using V_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type; - using L_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type; -#ifdef ENABLE_FP8_E5M2 - using V_quant_vec = typename Vec<cache_t, V_VEC_SIZE>::Type; -#endif - using Float_L_vec = typename FloatVec<L_vec>::Type; - - constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE; - constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW; - constexpr int NUM_ROWS_PER_THREAD = DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER); - - // NOTE(woosuk): We use FP32 for the accumulator for better accuracy. - float accs[NUM_ROWS_PER_THREAD]; -#pragma unroll - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - accs[i] = 0.f; - } - - scalar_t zero_value; - zero(zero_value); - for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) { - // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64 - // because int32 can lead to overflow when this variable is multiplied by large numbers - // (e.g., kv_block_stride). - const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]); - const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE; - const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset; - L_vec logits_vec; - from_float(logits_vec, *reinterpret_cast<Float_L_vec*>(logits + token_idx - start_token_idx)); - - const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride - + kv_head_idx * kv_head_stride; -#pragma unroll - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; - if (row_idx < HEAD_SIZE) { - const int offset = row_idx * BLOCK_SIZE + physical_block_offset; - V_vec v_vec; - if constexpr (IS_FP8_E5M2_KV_CACHE) { -#ifdef ENABLE_FP8_E5M2 - V_quant_vec v_quant_vec = *reinterpret_cast<const V_quant_vec*>(v_ptr + offset); - // Vector conversion from V_quant_vec to V_vec. - v_vec = fp8_e5m2_unscaled::vec_conversion<V_vec, V_quant_vec>(v_quant_vec); -#else - assert(false); -#endif - } else { - v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset); - } - if (block_idx == num_context_blocks - 1) { - // NOTE(woosuk): When v_vec contains the tokens that are out of the context, - // we should explicitly zero out the values since they may contain NaNs. - // See https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472 - scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec); -#pragma unroll - for (int j = 0; j < V_VEC_SIZE; j++) { - v_vec_ptr[j] = token_idx + j < context_len ? v_vec_ptr[j] : zero_value; - } - } - accs[i] += dot(logits_vec, v_vec); - } - } - } - - // Perform reduction within each warp. -#pragma unroll - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - float acc = accs[i]; -#pragma unroll - for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { - acc += VLLM_SHFL_XOR_SYNC(acc, mask); - } - accs[i] = acc; - } - - // NOTE(woosuk): A barrier is required because the shared memory space for logits - // is reused for the output. - __syncthreads(); - - // Perform reduction across warps. - float* out_smem = reinterpret_cast<float*>(shared_mem); -#pragma unroll - for (int i = NUM_WARPS; i > 1; i /= 2) { - int mid = i / 2; - // Upper warps write to shared memory. 
- if (warp_idx >= mid && warp_idx < i) { - float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; -#pragma unroll - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; - if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { - dst[row_idx] = accs[i]; - } - } - } - __syncthreads(); - - // Lower warps update the output. - if (warp_idx < mid) { - const float* src = &out_smem[warp_idx * HEAD_SIZE]; -#pragma unroll - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; - if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { - accs[i] += src[row_idx]; - } - } - } - __syncthreads(); - } - - // Write the final output. - if (warp_idx == 0) { - scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE - + head_idx * max_num_partitions * HEAD_SIZE - + partition_idx * HEAD_SIZE; -#pragma unroll - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; - if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { - from_float(*(out_ptr + row_idx), accs[i]); - } - } - } -} - -// Grid: (num_heads, num_seqs, 1). -template< - typename scalar_t, - typename cache_t, - int HEAD_SIZE, - int BLOCK_SIZE, - int NUM_THREADS, - bool IS_FP8_E5M2_KV_CACHE> -__global__ void paged_attention_v1_kernel( - scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - const int num_kv_heads, // [num_heads] - const float scale, - const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] - const int max_num_blocks_per_seq, - const float* __restrict__ alibi_slopes, // [num_heads] - const int q_stride, - const int kv_block_stride, - const int kv_head_stride) { - paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_E5M2_KV_CACHE>( - /* exp_sums */ nullptr, /* max_logits */ nullptr, - out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, - max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride); -} - -// Grid: (num_heads, num_seqs, max_num_partitions). 
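The cross-warp reduction above repeatedly folds the upper half of the active warps onto the lower half through shared memory until only warp 0 remains. A host-side sketch of the same pairwise folding for a single output row (sizes are illustrative):

```cpp
#include <cstdio>

// Pairwise halving reduction over per-warp partial sums for one output row.
int main() {
  const int NUM_WARPS = 4;
  float partials[NUM_WARPS] = {1.f, 2.f, 3.f, 4.f};   // per-warp accumulators

  for (int i = NUM_WARPS; i > 1; i /= 2) {
    const int mid = i / 2;
    // Warps [mid, i) "write to shared memory"; warps [0, mid) add them in.
    for (int w = 0; w < mid; ++w) partials[w] += partials[w + mid];
  }
  std::printf("row value after reduction = %.1f\n", partials[0]);  // 10.0
}
```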
-template< - typename scalar_t, - typename cache_t, - int HEAD_SIZE, - int BLOCK_SIZE, - int NUM_THREADS, - bool IS_FP8_E5M2_KV_CACHE, - int PARTITION_SIZE> -__global__ void paged_attention_v2_kernel( - float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] - float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] - scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - const int num_kv_heads, // [num_heads] - const float scale, - const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] - const int* __restrict__ context_lens, // [num_seqs] - const int max_num_blocks_per_seq, - const float* __restrict__ alibi_slopes, // [num_heads] - const int q_stride, - const int kv_block_stride, - const int kv_head_stride) { - paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_E5M2_KV_CACHE, PARTITION_SIZE>( - exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, - block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes, - q_stride, kv_block_stride, kv_head_stride); -} - -// Grid: (num_heads, num_seqs). -template< - typename scalar_t, - int HEAD_SIZE, - int NUM_THREADS, - int PARTITION_SIZE> -__global__ void paged_attention_v2_reduce_kernel( - scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] - const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] - const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions] - const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - const int* __restrict__ context_lens, // [num_seqs] - const int max_num_partitions) { - const int num_heads = gridDim.x; - const int head_idx = blockIdx.x; - const int seq_idx = blockIdx.y; - const int context_len = context_lens[seq_idx]; - const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); - if (num_partitions == 1) { - // No need to reduce. Only copy tmp_out to out. - scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; - const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE - + head_idx * max_num_partitions * HEAD_SIZE; - for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) { - out_ptr[i] = tmp_out_ptr[i]; - } - // Terminate the thread block. - return; - } - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - const int warp_idx = threadIdx.x / WARP_SIZE; - const int lane = threadIdx.x % WARP_SIZE; - - // Size: 2 * num_partitions. - extern __shared__ char shared_mem[]; - // Workspace for reduction. - __shared__ float red_smem[2 * NUM_WARPS]; - - // Load max logits to shared memory. - float* shared_max_logits = reinterpret_cast<float*>(shared_mem); - const float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions - + head_idx * max_num_partitions; - float max_logit = -FLT_MAX; - for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) { - const float l = max_logits_ptr[i]; - shared_max_logits[i] = l; - max_logit = fmaxf(max_logit, l); - } - __syncthreads(); - - // Get the global max logit. - // Reduce within the warp. 
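The v2 reduce kernel whose deletion starts above merges per-partition results by rescaling every partition's exponential sum to the global maximum logit and taking a weighted sum of the locally normalized partition outputs. A host-side sketch of that math, using one scalar per partition output in place of a head_size-wide vector (all values illustrative):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Each partition produced a locally normalized output plus (max_logit, exp_sum);
// the exact global softmax output is recovered by rescaling to the global max.
struct Partition { float max_logit; float exp_sum; float out; };

float reduce_partitions(const std::vector<Partition>& parts) {
  float global_max = -INFINITY;
  for (const auto& p : parts) global_max = std::fmax(global_max, p.max_logit);

  float global_exp_sum = 0.f;
  std::vector<float> rescaled(parts.size());
  for (size_t j = 0; j < parts.size(); ++j) {
    rescaled[j] = parts[j].exp_sum * std::exp(parts[j].max_logit - global_max);
    global_exp_sum += rescaled[j];
  }

  float acc = 0.f;
  for (size_t j = 0; j < parts.size(); ++j)
    acc += parts[j].out * rescaled[j] / global_exp_sum;
  return acc;
}

int main() {
  // Two partitions of a context (e.g. PARTITION_SIZE = 512 tokens each).
  std::vector<Partition> parts = {{3.0f, 10.f, 0.25f}, {5.0f, 4.f, 0.75f}};
  std::printf("combined output = %.4f\n", reduce_partitions(parts));
}
```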
-#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { - max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask)); - } - if (lane == 0) { - red_smem[warp_idx] = max_logit; - } - __syncthreads(); - // Reduce across warps. - max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { - max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask)); - } - // Broadcast the max value to all threads. - max_logit = VLLM_SHFL_SYNC(max_logit, 0); - - // Load rescaled exp sums to shared memory. - float* shared_exp_sums = reinterpret_cast<float*>(shared_mem + sizeof(float) * num_partitions); - const float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions - + head_idx * max_num_partitions; - float global_exp_sum = 0.0f; - for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) { - float l = shared_max_logits[i]; - float rescaled_exp_sum = exp_sums_ptr[i] * expf(l - max_logit); - global_exp_sum += rescaled_exp_sum; - shared_exp_sums[i] = rescaled_exp_sum; - } - __syncthreads(); - global_exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], global_exp_sum); - const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f); - - // Aggregate tmp_out to out. - const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE - + head_idx * max_num_partitions * HEAD_SIZE; - scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; -#pragma unroll - for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) { - float acc = 0.0f; - for (int j = 0; j < num_partitions; ++j) { - acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * inv_global_exp_sum; - } - from_float(out_ptr[i], acc); - } -} - -} // namespace vllm - -#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ - VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ - ((void*)vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, \ - IS_FP8_E5M2_KV_CACHE>), shared_mem_size); \ - vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, \ - IS_FP8_E5M2_KV_CACHE><<<grid, block, shared_mem_size, stream>>>( \ - out_ptr, \ - query_ptr, \ - key_cache_ptr, \ - value_cache_ptr, \ - num_kv_heads, \ - scale, \ - block_tables_ptr, \ - context_lens_ptr, \ - max_num_blocks_per_seq, \ - alibi_slopes_ptr, \ - q_stride, \ - kv_block_stride, \ - kv_head_stride); - -// TODO(woosuk): Tune NUM_THREADS. -template< - typename T, - typename CACHE_T, - int BLOCK_SIZE, - bool IS_FP8_E5M2_KV_CACHE, - int NUM_THREADS = 128> -void paged_attention_v1_launcher( - torch::Tensor& out, - torch::Tensor& query, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - int num_kv_heads, - float scale, - torch::Tensor& block_tables, - torch::Tensor& context_lens, - int max_context_len, - const c10::optional<torch::Tensor>& alibi_slopes) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = alibi_slopes ? 
- reinterpret_cast<const float*>(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast<T*>(out.data_ptr()); - T* query_ptr = reinterpret_cast<T*>(query.data_ptr()); - CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr()); - CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr<int>(); - int* context_lens_ptr = context_lens.data_ptr<int>(); - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int padded_max_context_len = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE; - int logits_size = padded_max_context_len * sizeof(float); - int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); - // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len - // Keep that in sync with the logic here! - int shared_mem_size = std::max(logits_size, outputs_size); - - dim3 grid(num_heads, num_seqs, 1); - dim3 block(NUM_THREADS); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - switch (head_size) { - // NOTE(woosuk): To reduce the compilation time, we only compile for the - // head sizes that we use in the model. However, we can easily extend this - // to support any head size which is a multiple of 16. - case 64: - LAUNCH_PAGED_ATTENTION_V1(64); - break; - case 80: - LAUNCH_PAGED_ATTENTION_V1(80); - break; - case 96: - LAUNCH_PAGED_ATTENTION_V1(96); - break; - case 112: - LAUNCH_PAGED_ATTENTION_V1(112); - break; - case 128: - LAUNCH_PAGED_ATTENTION_V1(128); - break; - case 256: - LAUNCH_PAGED_ATTENTION_V1(256); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE) \ - paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE>( \ - out, \ - query, \ - key_cache, \ - value_cache, \ - num_kv_heads, \ - scale, \ - block_tables, \ - context_lens, \ - max_context_len, \ - alibi_slopes); - -// NOTE(woosuk): To reduce the compilation time, we omitted block sizes -// 1, 2, 4, 64, 128, 256. 
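The v1 launcher above sizes dynamic shared memory as the larger of the FP32 logits buffer (one float per padded context token) and the cross-warp output staging buffer. A worked example with illustrative numbers (max_context_len = 4096, BLOCK_SIZE = 16, head_size = 128, NUM_THREADS = 128):

```cpp
#include <algorithm>
#include <cstdio>

// Worked example of the v1 shared-memory sizing.
int main() {
  const int max_context_len = 4096, BLOCK_SIZE = 16, head_size = 128;
  const int NUM_THREADS = 128, WARP_SIZE = 32;
  const int NUM_WARPS = NUM_THREADS / WARP_SIZE;

  const int padded_max_context_len =
      (max_context_len + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE;           // 4096
  const int logits_size  = padded_max_context_len * (int)sizeof(float);       // 16384 bytes
  const int outputs_size = (NUM_WARPS / 2) * head_size * (int)sizeof(float);  // 1024 bytes
  const int shared_mem_size = std::max(logits_size, outputs_size);

  std::printf("logits=%dB outputs=%dB shared=%dB\n",
              logits_size, outputs_size, shared_mem_size);
}
```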
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ - switch (block_size) { \ - case 8: \ - CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE); \ - break; \ - case 16: \ - CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE); \ - break; \ - case 32: \ - CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } - -void paged_attention_v1( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] - int num_kv_heads, // [num_heads] - float scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& context_lens, // [num_seqs] - int block_size, - int max_context_len, - const c10::optional<torch::Tensor>& alibi_slopes, - const std::string& kv_cache_dtype) { - if (kv_cache_dtype == "auto") { - if (query.dtype() == at::ScalarType::Float) { - CALL_V1_LAUNCHER_BLOCK_SIZE(float, float, false); - } else if (query.dtype() == at::ScalarType::Half) { - CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false); - } else if (query.dtype() == at::ScalarType::BFloat16) { - CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false); - } else { - TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); - } - } else if (kv_cache_dtype == "fp8_e5m2") { - if (query.dtype() == at::ScalarType::Float) { - CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); - } else if (query.dtype() == at::ScalarType::Half) { - CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true); - } else if (query.dtype() == at::ScalarType::BFloat16) { - CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true); - } else { - TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); - } - } else { - TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype); - } -} - -#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ - vllm::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, \ - IS_FP8_E5M2_KV_CACHE, PARTITION_SIZE> \ - <<<grid, block, shared_mem_size, stream>>>( \ - exp_sums_ptr, \ - max_logits_ptr, \ - tmp_out_ptr, \ - query_ptr, \ - key_cache_ptr, \ - value_cache_ptr, \ - num_kv_heads, \ - scale, \ - block_tables_ptr, \ - context_lens_ptr, \ - max_num_blocks_per_seq, \ - alibi_slopes_ptr, \ - q_stride, \ - kv_block_stride, \ - kv_head_stride); \ - vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, PARTITION_SIZE> \ - <<<reduce_grid, block, reduce_shared_mem_size, stream>>>( \ - out_ptr, \ - exp_sums_ptr, \ - max_logits_ptr, \ - tmp_out_ptr, \ - context_lens_ptr, \ - max_num_partitions); - -template< - typename T, - typename CACHE_T, - int BLOCK_SIZE, - bool IS_FP8_E5M2_KV_CACHE, - int NUM_THREADS = 128, - int PARTITION_SIZE = 512> -void paged_attention_v2_launcher( - torch::Tensor& out, - torch::Tensor& exp_sums, - torch::Tensor& max_logits, - torch::Tensor& tmp_out, - torch::Tensor& query, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - int num_kv_heads, - float scale, - torch::Tensor& block_tables, - torch::Tensor& context_lens, - int max_context_len, - const c10::optional<torch::Tensor>& alibi_slopes) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int 
kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = alibi_slopes ? - reinterpret_cast<const float*>(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast<T*>(out.data_ptr()); - float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr()); - float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr()); - T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr()); - T* query_ptr = reinterpret_cast<T*>(query.data_ptr()); - CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr()); - CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr<int>(); - int* context_lens_ptr = context_lens.data_ptr<int>(); - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE); - int logits_size = PARTITION_SIZE * sizeof(float); - int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); - - // For paged attention v2 kernel. - dim3 grid(num_heads, num_seqs, max_num_partitions); - int shared_mem_size = std::max(logits_size, outputs_size); - // For paged attention v2 reduce kernel. - dim3 reduce_grid(num_heads, num_seqs); - int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); - - dim3 block(NUM_THREADS); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - switch (head_size) { - // NOTE(woosuk): To reduce the compilation time, we only compile for the - // head sizes that we use in the model. However, we can easily extend this - // to support any head size which is a multiple of 16. - case 64: - LAUNCH_PAGED_ATTENTION_V2(64); - break; - case 80: - LAUNCH_PAGED_ATTENTION_V2(80); - break; - case 96: - LAUNCH_PAGED_ATTENTION_V2(96); - break; - case 112: - LAUNCH_PAGED_ATTENTION_V2(112); - break; - case 128: - LAUNCH_PAGED_ATTENTION_V2(128); - break; - case 256: - LAUNCH_PAGED_ATTENTION_V2(256); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE) \ - paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE>( \ - out, \ - exp_sums, \ - max_logits, \ - tmp_out, \ - query, \ - key_cache, \ - value_cache, \ - num_kv_heads, \ - scale, \ - block_tables, \ - context_lens, \ - max_context_len, \ - alibi_slopes); - -// NOTE(woosuk): To reduce the compilation time, we omitted block sizes -// 1, 2, 4, 64, 128, 256. 
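For the v2 path above, the grid gains a partition dimension and the reduce kernel gets its own small shared-memory workspace (max logits plus rescaled exp sums). A worked example of that launch geometry with illustrative numbers (max_context_len = 8192, PARTITION_SIZE = 512, num_heads = 32, num_seqs = 8):

```cpp
#include <algorithm>
#include <cstdio>

// Worked example of the v2 launch geometry and workspace sizes.
int main() {
  const int max_context_len = 8192, PARTITION_SIZE = 512;
  const int num_heads = 32, num_seqs = 8, head_size = 128;
  const int NUM_THREADS = 128, WARP_SIZE = 32, NUM_WARPS = NUM_THREADS / WARP_SIZE;

  const int max_num_partitions =
      (max_context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;                 // 16
  const int logits_size  = PARTITION_SIZE * (int)sizeof(float);                // 2048 bytes
  const int outputs_size = (NUM_WARPS / 2) * head_size * (int)sizeof(float);   // 1024 bytes
  const int shared_mem_size = std::max(logits_size, outputs_size);
  const int reduce_shared_mem_size = 2 * max_num_partitions * (int)sizeof(float);

  std::printf("v2 grid = (%d, %d, %d), shared=%dB\n",
              num_heads, num_seqs, max_num_partitions, shared_mem_size);
  std::printf("reduce grid = (%d, %d), shared=%dB\n",
              num_heads, num_seqs, reduce_shared_mem_size);
}
```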
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ - switch (block_size) { \ - case 8: \ - CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE); \ - break; \ - case 16: \ - CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE); \ - break; \ - case 32: \ - CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } - -void paged_attention_v2( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] - int num_kv_heads, // [num_heads] - float scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& context_lens, // [num_seqs] - int block_size, - int max_context_len, - const c10::optional<torch::Tensor>& alibi_slopes, - const std::string& kv_cache_dtype) { - if (kv_cache_dtype == "auto") { - if (query.dtype() == at::ScalarType::Float) { - CALL_V2_LAUNCHER_BLOCK_SIZE(float, float, false); - } else if (query.dtype() == at::ScalarType::Half) { - CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false); - } else if (query.dtype() == at::ScalarType::BFloat16) { - CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false); - } else { - TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); - } - } else if (kv_cache_dtype == "fp8_e5m2") { - if (query.dtype() == at::ScalarType::Float) { - CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); - } else if (query.dtype() == at::ScalarType::Half) { - CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true); - } else if (query.dtype() == at::ScalarType::BFloat16) { - CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true); - } else { - TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); - } - } else { - TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype); - } -} - -#undef WARP_SIZE -#undef MAX -#undef MIN -#undef DIVIDE_ROUND_UP diff --git a/csrc/attention/attention_utils.cuh b/csrc/attention/attention_utils.cuh deleted file mode 100644 index ff64c4bd8f80c200647e688db1a74c711a9f709d..0000000000000000000000000000000000000000 --- a/csrc/attention/attention_utils.cuh +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp - * Copyright (c) 2023, The vLLM team. - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include "../cuda_compat.h" -#include "attention_dtypes.h" - -#include <float.h> -#include <type_traits> - -namespace vllm { - -// Q*K^T operation. -template<int THREAD_GROUP_SIZE, typename Vec, int N> -inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { - using A_vec = typename FloatVec<Vec>::Type; - // Compute the parallel products for Q*K^T (treat vector lanes separately). - A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]); -#pragma unroll - for (int ii = 1; ii < N; ++ii) { - qk_vec = fma(q[ii], k[ii], qk_vec); - } - - // Finalize the reduction across lanes. - float qk = sum(qk_vec); -#pragma unroll - for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { - qk += VLLM_SHFL_XOR_SYNC(qk, mask); - } - return qk; -} - -template<typename T, int THREAD_GROUP_SIZE> -struct Qk_dot { - template<typename Vec, int N> - static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { - return qk_dot_<THREAD_GROUP_SIZE>(q, k); - } -}; - -} // namespace vllm diff --git a/csrc/attention/dtype_bfloat16.cuh b/csrc/attention/dtype_bfloat16.cuh deleted file mode 100644 index 31e0cee01d2e1ff47b3381590c5cbc2c8446b556..0000000000000000000000000000000000000000 --- a/csrc/attention/dtype_bfloat16.cuh +++ /dev/null @@ -1,451 +0,0 @@ -/* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp - * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h - * Copyright (c) 2023, The vLLM team. - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "attention_generic.cuh" -#include "dtype_float32.cuh" - -#ifndef USE_ROCM - #include <cuda_bf16.h> - #include <cuda_fp16.h> -#else - #include <hip/hip_bf16.h> - #include <hip/hip_fp16.h> - - typedef __hip_bfloat162 __nv_bfloat162; - typedef __hip_bfloat16 __nv_bfloat16; -#endif - -#include <stdint.h> - -namespace vllm { - -// Define custom BF16 vector data types. -struct bf16_4_t { - __nv_bfloat162 x; - __nv_bfloat162 y; -}; - -struct bf16_8_t { - __nv_bfloat162 x; - __nv_bfloat162 y; - __nv_bfloat162 z; - __nv_bfloat162 w; -}; - -// BF16 vector types for Q, K, V. -template<> -struct Vec<__nv_bfloat16, 1> { - using Type = __nv_bfloat16; -}; -template<> -struct Vec<__nv_bfloat16, 2> { - using Type = __nv_bfloat162; -}; -template<> -struct Vec<__nv_bfloat16, 4> { - using Type = bf16_4_t; -}; -template<> -struct Vec<__nv_bfloat16, 8> { - using Type = bf16_8_t; -}; - -// FP32 accumulator vector types corresponding to Vec. 
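The `Qk_dot` helper deleted above accumulates each thread group's slice of the Q·K dot product in FP32 and then reduces the partial sums across the group. A host-side sketch of that pattern with illustrative sizes; the strided per-thread ownership of head-dimension elements stands in for the vectorized loads:

```cpp
#include <cstdio>
#include <vector>

// FP32-accumulated dot product split across a thread group, then reduced.
int main() {
  const int THREAD_GROUP_SIZE = 4, HEAD_SIZE = 8;
  std::vector<float> q(HEAD_SIZE), k(HEAD_SIZE);
  for (int i = 0; i < HEAD_SIZE; ++i) { q[i] = 0.1f * i; k[i] = 1.0f; }

  // Thread t owns elements t, t + GROUP, t + 2*GROUP, ... of the head dimension.
  float partial[THREAD_GROUP_SIZE] = {};
  for (int t = 0; t < THREAD_GROUP_SIZE; ++t)
    for (int i = t; i < HEAD_SIZE; i += THREAD_GROUP_SIZE)
      partial[t] += q[i] * k[i];                 // FP32 multiply-accumulate

  float qk = 0.f;                                // reduction across the group
  for (int t = 0; t < THREAD_GROUP_SIZE; ++t) qk += partial[t];
  std::printf("q.k = %.2f\n", qk);               // 0.1*(0+1+...+7) = 2.80
}
```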
-template<> -struct FloatVec<__nv_bfloat16> { - using Type = float; -}; -template<> -struct FloatVec<__nv_bfloat162> { - using Type = float2; -}; -template<> -struct FloatVec<bf16_4_t> { - using Type = Float4_; -}; -template<> -struct FloatVec<bf16_8_t> { - using Type = Float8_; -}; - -// Utility functions for type conversions. -inline __device__ float2 bf1622float2(const __nv_bfloat162 val) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - return __bfloat1622float2(val); -#endif -} - -inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - return __bfloat162bfloat162(val); -#endif -} - -// Vector addition. -inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - #ifndef USE_ROCM - return a + b; - #else - return __hadd(a, b); - #endif -#endif -} - -inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - return __hadd2(a, b); -#endif -} - -inline __device__ bf16_4_t add(bf16_4_t a, bf16_4_t b) { - bf16_4_t c; - c.x = add(a.x, b.x); - c.y = add(a.y, b.y); - return c; -} - -inline __device__ bf16_8_t add(bf16_8_t a, bf16_8_t b) { - bf16_8_t c; - c.x = add(a.x, b.x); - c.y = add(a.y, b.y); - c.z = add(a.z, b.z); - c.w = add(a.w, b.w); - return c; -} - -inline __device__ float2 add(__nv_bfloat162 a, float2 fb) { - float2 fa = bf1622float2(a); - return add(fa, fb); -} - -inline __device__ Float4_ add(bf16_4_t a, Float4_ fb) { - Float4_ fc; - fc.x = add(a.x, fb.x); - fc.y = add(a.y, fb.y); - return fc; -} - -inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) { - Float8_ fc; - fc.x = add(a.x, fb.x); - fc.y = add(a.y, fb.y); - fc.z = add(a.z, fb.z); - fc.w = add(a.w, fb.w); - return fc; -} - -// Vector multiplication. 
-template<> -inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - return __hmul(a, b); -#endif -} - -template<> -inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - return __hmul2(a, b); -#endif -} - -template<> -inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) { - return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b); -} - -template<> -inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) { - bf16_4_t c; - c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); - c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.y, b.y); - return c; -} - -template<> -inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) { - __nv_bfloat162 s = bf162bf162(a); - bf16_4_t c; - c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.x); - c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.y); - return c; -} - -template<> -inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) { - bf16_8_t c; - c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); - c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.y, b.y); - c.z = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.z, b.z); - c.w = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.w, b.w); - return c; -} - -template<> -inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) { - __nv_bfloat162 s = bf162bf162(a); - bf16_8_t c; - c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.x); - c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.y); - c.z = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.z); - c.w = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.w); - return c; -} - -template<> -inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) { - float fa = __bfloat162float(a); - float fb = __bfloat162float(b); - return fa * fb; -} - -template<> -inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) { - float2 fa = bf1622float2(a); - float2 fb = bf1622float2(b); - return mul<float2, float2, float2>(fa, fb); -} - -template<> -inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) { - return mul<float2, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b); -} - -template<> -inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) { - Float4_ fc; - fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); - fc.y = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.y, b.y); - return fc; -} - -template<> -inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) { - __nv_bfloat162 s = bf162bf162(a); - Float4_ fc; - fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(s, b.x); - fc.y = mul<float2, __nv_bfloat162, __nv_bfloat162>(s, b.y); - return fc; -} - -template<> -inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) { - Float8_ fc; - fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); - fc.y = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.y, b.y); - fc.z = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.z, b.z); - fc.w = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.w, b.w); - return fc; -} - -template<> -inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) { - __nv_bfloat162 s = bf162bf162(a); - Float8_ fc; - fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(s, b.x); - fc.y = mul<float2, __nv_bfloat162, __nv_bfloat162>(s, b.y); - fc.z = mul<float2, __nv_bfloat162, 
__nv_bfloat162>(s, b.z); - fc.w = mul<float2, __nv_bfloat162, __nv_bfloat162>(s, b.w); - return fc; -} - -// Vector fused multiply-add. -inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - return __hfma2(a, b, c); -#endif -} - -inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - return __hfma2(bf162bf162(a), b, c); -#endif -} - -inline __device__ bf16_4_t fma(bf16_4_t a, bf16_4_t b, bf16_4_t c) { - bf16_4_t d; - d.x = fma(a.x, b.x, c.x); - d.y = fma(a.y, b.y, c.y); - return d; -} - -inline __device__ bf16_4_t fma(__nv_bfloat16 a, bf16_4_t b, bf16_4_t c) { - __nv_bfloat162 s = bf162bf162(a); - bf16_4_t d; - d.x = fma(s, b.x, c.x); - d.y = fma(s, b.y, c.y); - return d; -} - -inline __device__ bf16_8_t fma(bf16_8_t a, bf16_8_t b, bf16_8_t c) { - bf16_8_t d; - d.x = fma(a.x, b.x, c.x); - d.y = fma(a.y, b.y, c.y); - d.z = fma(a.z, b.z, c.z); - d.w = fma(a.w, b.w, c.w); - return d; -} - -inline __device__ bf16_8_t fma(__nv_bfloat16 a, bf16_8_t b, bf16_8_t c) { - __nv_bfloat162 s = bf162bf162(a); - bf16_8_t d; - d.x = fma(s, b.x, c.x); - d.y = fma(s, b.y, c.y); - d.z = fma(s, b.z, c.z); - d.w = fma(s, b.w, c.w); - return d; -} - -inline __device__ float fma(__nv_bfloat16 a, __nv_bfloat16 b, float fc) { - return __bfloat162float(a) * __bfloat162float(b) + fc; -} - -inline __device__ float2 fma(__nv_bfloat162 a, __nv_bfloat162 b, float2 fc) { - float2 fa = bf1622float2(a); - float2 fb = bf1622float2(b); - return fma(fa, fb, fc); -} - -inline __device__ float2 fma(__nv_bfloat16 a, __nv_bfloat162 b, float2 fc) { - return fma(bf162bf162(a), b, fc); -} - -inline __device__ Float4_ fma(bf16_4_t a, bf16_4_t b, Float4_ fc) { - Float4_ fd; - fd.x = fma(a.x, b.x, fc.x); - fd.y = fma(a.y, b.y, fc.y); - return fd; -} - -inline __device__ Float4_ fma(__nv_bfloat16 a, bf16_4_t b, Float4_ fc) { - __nv_bfloat162 s = bf162bf162(a); - Float4_ fd; - fd.x = fma(s, b.x, fc.x); - fd.y = fma(s, b.y, fc.y); - return fd; -} - -inline __device__ Float8_ fma(bf16_8_t a, bf16_8_t b, Float8_ fc) { - Float8_ fd; - fd.x = fma(a.x, b.x, fc.x); - fd.y = fma(a.y, b.y, fc.y); - fd.z = fma(a.z, b.z, fc.z); - fd.w = fma(a.w, b.w, fc.w); - return fd; -} - -inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) { - __nv_bfloat162 s = bf162bf162(a); - Float8_ fd; - fd.x = fma(s, b.x, fc.x); - fd.y = fma(s, b.y, fc.y); - fd.z = fma(s, b.z, fc.z); - fd.w = fma(s, b.w, fc.w); - return fd; -} - -// Vector sum. -template<> -inline __device__ float sum(__nv_bfloat16 v) { - return __bfloat162float(v); -} - -template<> -inline __device__ float sum(__nv_bfloat162 v) { - float2 vf = bf1622float2(v); - return vf.x + vf.y; -} - -template<> -inline __device__ float sum(bf16_4_t v) { - return sum(v.x) + sum(v.y); -} - -template<> -inline __device__ float sum(bf16_8_t v) { - return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w); -} - -// From float32 to bfloat16. 
-inline __device__ void from_float(__nv_bfloat16& dst, float src) { - dst = __float2bfloat16(src); -} - -inline __device__ void from_float(__nv_bfloat162& dst, float2 src) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - dst = __float22bfloat162_rn(src); -#endif -} - -inline __device__ void from_float(bf16_4_t& dst, Float4_ src) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - dst.x = __float22bfloat162_rn(src.x); - dst.y = __float22bfloat162_rn(src.y); -#endif -} - -inline __device__ void from_float(bf16_8_t& dst, Float8_ src) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - dst.x = __float22bfloat162_rn(src.x); - dst.y = __float22bfloat162_rn(src.y); - dst.z = __float22bfloat162_rn(src.z); - dst.w = __float22bfloat162_rn(src.w); -#endif -} - -// From bfloat16 to float32. -inline __device__ float to_float(__nv_bfloat16 u) { - return __bfloat162float(u); -} - -// Zero-out a variable. -inline __device__ void zero(__nv_bfloat16& dst) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - // Same as CUDART_ZERO_BF16 introduced in CUDA 12.2. - dst = __ushort_as_bfloat16((unsigned short)0x0000U); -#endif -} - -} // namespace vllm diff --git a/csrc/attention/dtype_float16.cuh b/csrc/attention/dtype_float16.cuh deleted file mode 100644 index d3271e69cd69d93abe03539604b17a380eb094b8..0000000000000000000000000000000000000000 --- a/csrc/attention/dtype_float16.cuh +++ /dev/null @@ -1,502 +0,0 @@ -/* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp - * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h - * Copyright (c) 2023, The vLLM team. - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "attention_generic.cuh" -#include "dtype_float32.cuh" - -#ifdef USE_ROCM - #include <hip/hip_fp16.h> -#endif - -#include <stdint.h> - -namespace vllm { - -// FP16 vector types for Q, K, V. -template<> -struct Vec<uint16_t, 1> { - using Type = uint16_t; -}; -template<> -struct Vec<uint16_t, 2> { - using Type = uint32_t; -}; -template<> -struct Vec<uint16_t, 4> { - using Type = uint2; -}; -template<> -struct Vec<uint16_t, 8> { - using Type = uint4; -}; - -// FP32 accumulator vector types corresponding to Vec. -template<> -struct FloatVec<uint16_t> { - using Type = float; -}; -template<> -struct FloatVec<uint32_t> { - using Type = float2; -}; -template<> -struct FloatVec<uint2> { - using Type = Float4_; -}; -template<> -struct FloatVec<uint4> { - using Type = Float8_; -}; - -// Utility functions for type conversions. 
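The FP16 conversion helpers that follow use inline PTX (or GCN) assembly. For reference, a portable C++ sketch of the fp16 → fp32 widening that `cvt.f32.f16` performs; subnormal inputs are flushed to zero here for brevity, which the hardware instruction does not do:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Expand an IEEE fp16 bit pattern to fp32 (normals, zero, inf, NaN).
float half_bits_to_float(uint16_t h) {
  const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
  const uint32_t exp  = (h >> 10) & 0x1Fu;
  const uint32_t man  = h & 0x3FFu;
  uint32_t bits;
  if (exp == 0)        bits = sign;                                      // zero / subnormal (flushed)
  else if (exp == 31)  bits = sign | 0x7F800000u | (man << 13);          // inf / NaN
  else                 bits = sign | ((exp + 112u) << 23) | (man << 13); // rebias exponent 15 -> 127
  float f;
  std::memcpy(&f, &bits, sizeof f);
  return f;
}

int main() {
  std::printf("%g %g %g\n",
              half_bits_to_float(0x3C00),   // 1.0
              half_bits_to_float(0xC000),   // -2.0
              half_bits_to_float(0x3555));  // ~0.333
}
```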
-inline __device__ uint32_t h0_h0(uint16_t a) { -#ifndef USE_ROCM - uint32_t b; - asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a)); - return b; -#else - union { - uint32_t u32; - uint16_t u16[2]; - } tmp; - tmp.u16[0] = a; - tmp.u16[1] = a; - return tmp.u32; -#endif -} - -inline __device__ float half_to_float(uint16_t h) { - float f; -#ifndef USE_ROCM - asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h)); -#else - asm volatile("v_cvt_f32_f16 %0, %1;" : "=v"(f) : "v"(h)); -#endif - return f; -} - -inline __device__ float2 half2_to_float2(uint32_t v) { -#ifndef USE_ROCM - uint16_t lo, hi; - asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v)); - return make_float2(half_to_float(lo), half_to_float(hi)); -#else - union { - uint32_t u32; - uint16_t u16[2]; - } tmp; - tmp.u32 = v; - float2 ret; - ret.x = half_to_float(tmp.u16[0]); - ret.y = half_to_float(tmp.u16[1]); - return ret; -#endif -} - -inline __device__ uint16_t float_to_half(float f) { - union { - uint32_t u32; - uint16_t u16[2]; - } tmp; -#ifndef USE_ROCM - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f)); -#else - asm volatile("v_cvt_f16_f32 %0, %1;\n" : "=v"(tmp.u32) : "v"(f)); -#endif - return tmp.u16[0]; -} - -inline __device__ uint32_t float2_to_half2(float2 f) { - union { - uint32_t u32; - uint16_t u16[2]; - } tmp; -#ifndef USE_ROCM - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x)); - #else - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); - #endif -#else - tmp.u16[0] = float_to_half(f.x); - tmp.u16[1] = float_to_half(f.y); -#endif - return tmp.u32; -} - -// Vector addition. -inline __device__ uint16_t add(uint16_t a, uint16_t b) { - uint16_t c; -#ifndef USE_ROCM - asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); -#else - asm volatile("v_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); -#endif - return c; -} - -inline __device__ uint32_t add(uint32_t a, uint32_t b) { - uint32_t c; -#ifndef USE_ROCM - asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); -#else - asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); -#endif - return c; -} - -inline __device__ uint2 add(uint2 a, uint2 b) { - uint2 c; - c.x = add(a.x, b.x); - c.y = add(a.y, b.y); - return c; -} - -inline __device__ uint4 add(uint4 a, uint4 b) { - uint4 c; - c.x = add(a.x, b.x); - c.y = add(a.y, b.y); - c.z = add(a.z, b.z); - c.w = add(a.w, b.w); - return c; -} - -inline __device__ float2 add(uint32_t a, float2 fb) { - float2 fa = half2_to_float2(a); - return add(fa, fb); -} - -inline __device__ Float4_ add(uint2 a, Float4_ fb) { - Float4_ fc; - fc.x = add(a.x, fb.x); - fc.y = add(a.y, fb.y); - return fc; -} - -inline __device__ Float8_ add(uint4 a, Float8_ fb) { - Float8_ fc; - fc.x = add(a.x, fb.x); - fc.y = add(a.y, fb.y); - fc.z = add(a.z, fb.z); - fc.w = add(a.w, fb.w); - return fc; -} - -// Vector multiplication. 
-template<> -inline __device__ uint16_t mul(uint16_t a, uint16_t b) { - uint16_t c; -#ifndef USE_ROCM - asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); -#else - asm volatile("v_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); -#endif - return c; -} - -template<> -inline __device__ uint32_t mul(uint32_t a, uint32_t b) { - uint32_t c; -#ifndef USE_ROCM - asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); -#else - asm volatile("v_pk_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b)); -#endif - return c; -} - -template<> -inline __device__ uint32_t mul(uint16_t a, uint32_t b) { - return mul<uint32_t, uint32_t, uint32_t>(h0_h0(a), b); -} - -template<> -inline __device__ uint2 mul(uint2 a, uint2 b) { - uint2 c; - c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x); - c.y = mul<uint32_t, uint32_t, uint32_t>(a.y, b.y); - return c; -} - -template<> -inline __device__ uint2 mul(uint16_t a, uint2 b) { - uint32_t s = h0_h0(a); - uint2 c; - c.x = mul<uint32_t, uint32_t, uint32_t>(s, b.x); - c.y = mul<uint32_t, uint32_t, uint32_t>(s, b.y); - return c; -} - -template<> -inline __device__ uint4 mul(uint4 a, uint4 b) { - uint4 c; - c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x); - c.y = mul<uint32_t, uint32_t, uint32_t>(a.y, b.y); - c.z = mul<uint32_t, uint32_t, uint32_t>(a.z, b.z); - c.w = mul<uint32_t, uint32_t, uint32_t>(a.w, b.w); - return c; -} - -template<> -inline __device__ uint4 mul(uint16_t a, uint4 b) { - uint32_t s = h0_h0(a); - uint4 c; - c.x = mul<uint32_t, uint32_t, uint32_t>(s, b.x); - c.y = mul<uint32_t, uint32_t, uint32_t>(s, b.y); - c.z = mul<uint32_t, uint32_t, uint32_t>(s, b.z); - c.w = mul<uint32_t, uint32_t, uint32_t>(s, b.w); - return c; -} - -template<> -inline __device__ float mul(uint16_t a, uint16_t b) { - float fa = half_to_float(a); - float fb = half_to_float(b); - return fa * fb; -} - -template<> -inline __device__ float2 mul(uint32_t a, uint32_t b) { - float2 fa = half2_to_float2(a); - float2 fb = half2_to_float2(b); - return mul<float2, float2, float2>(fa, fb); -} - -template<> -inline __device__ float2 mul(uint16_t a, uint32_t b) { - return mul<float2, uint32_t, uint32_t>(h0_h0(a), b); -} - -template<> -inline __device__ Float4_ mul(uint2 a, uint2 b) { - Float4_ fc; - fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x); - fc.y = mul<float2, uint32_t, uint32_t>(a.y, b.y); - return fc; -} - -template<> -inline __device__ Float4_ mul(uint16_t a, uint2 b) { - uint32_t s = h0_h0(a); - Float4_ fc; - fc.x = mul<float2, uint32_t, uint32_t>(s, b.x); - fc.y = mul<float2, uint32_t, uint32_t>(s, b.y); - return fc; -} - -template<> -inline __device__ Float8_ mul(uint4 a, uint4 b) { - Float8_ fc; - fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x); - fc.y = mul<float2, uint32_t, uint32_t>(a.y, b.y); - fc.z = mul<float2, uint32_t, uint32_t>(a.z, b.z); - fc.w = mul<float2, uint32_t, uint32_t>(a.w, b.w); - return fc; -} - -template<> -inline __device__ Float8_ mul(uint16_t a, uint4 b) { - uint32_t s = h0_h0(a); - Float8_ fc; - fc.x = mul<float2, uint32_t, uint32_t>(s, b.x); - fc.y = mul<float2, uint32_t, uint32_t>(s, b.y); - fc.z = mul<float2, uint32_t, uint32_t>(s, b.z); - fc.w = mul<float2, uint32_t, uint32_t>(s, b.w); - return fc; -} - -// Vector fused multiply-add. 
-inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) { - uint32_t d; -#ifndef USE_ROCM - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c)); -#else - asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c)); -#endif - return d; -} - -inline __device__ uint32_t fma(uint16_t a, uint32_t b, uint32_t c) { - return fma(h0_h0(a), b, c); -} - -inline __device__ uint2 fma(uint2 a, uint2 b, uint2 c) { - uint2 d; - d.x = fma(a.x, b.x, c.x); - d.y = fma(a.y, b.y, c.y); - return d; -} - -inline __device__ uint2 fma(uint16_t a, uint2 b, uint2 c) { - uint32_t s = h0_h0(a); - uint2 d; - d.x = fma(s, b.x, c.x); - d.y = fma(s, b.y, c.y); - return d; -} - -inline __device__ uint4 fma(uint4 a, uint4 b, uint4 c) { - uint4 d; - d.x = fma(a.x, b.x, c.x); - d.y = fma(a.y, b.y, c.y); - d.z = fma(a.z, b.z, c.z); - d.w = fma(a.w, b.w, c.w); - return d; -} - -inline __device__ uint4 fma(uint16_t a, uint4 b, uint4 c) { - uint32_t s = h0_h0(a); - uint4 d; - d.x = fma(s, b.x, c.x); - d.y = fma(s, b.y, c.y); - d.z = fma(s, b.z, c.z); - d.w = fma(s, b.w, c.w); - return d; -} - -inline __device__ float fma(uint16_t a, uint16_t b, float fc) { - float fa = half_to_float(a); - float fb = half_to_float(b); - return fa * fb + fc; -} - -inline __device__ float2 fma(uint32_t a, uint32_t b, float2 fc) { - float2 fa = half2_to_float2(a); - float2 fb = half2_to_float2(b); - return fma(fa, fb, fc); -} - -inline __device__ float2 fma(uint16_t a, uint32_t b, float2 fc) { - return fma(h0_h0(a), b, fc); -} - -inline __device__ Float4_ fma(uint2 a, uint2 b, Float4_ fc) { - Float4_ fd; - fd.x = fma(a.x, b.x, fc.x); - fd.y = fma(a.y, b.y, fc.y); - return fd; -} - -inline __device__ Float4_ fma(uint16_t a, uint2 b, Float4_ fc) { - uint32_t s = h0_h0(a); - Float4_ fd; - fd.x = fma(s, b.x, fc.x); - fd.y = fma(s, b.y, fc.y); - return fd; -} - -inline __device__ Float8_ fma(uint4 a, uint4 b, Float8_ fc) { - Float8_ fd; - fd.x = fma(a.x, b.x, fc.x); - fd.y = fma(a.y, b.y, fc.y); - fd.z = fma(a.z, b.z, fc.z); - fd.w = fma(a.w, b.w, fc.w); - return fd; -} - -inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) { - uint32_t s = h0_h0(a); - Float8_ fd; - fd.x = fma(s, b.x, fc.x); - fd.y = fma(s, b.y, fc.y); - fd.z = fma(s, b.z, fc.z); - fd.w = fma(s, b.w, fc.w); - return fd; -} - -// Vector sum. -template<> -inline __device__ float sum(uint16_t v) { - return half_to_float(v); -} - -template<> -inline __device__ float sum(uint32_t v) { - float2 tmp = half2_to_float2(v); - return tmp.x + tmp.y; -} - -template<> -inline __device__ float sum(uint2 v) { - uint32_t c = add(v.x, v.y); - return sum(c); -} - -template<> -inline __device__ float sum(uint4 v) { - uint32_t c = add(v.x, v.y); - c = add(c, v.z); - c = add(c, v.w); - return sum(c); -} - -// From float32 to float16. -inline __device__ void from_float(uint16_t& dst, float src) { - dst = float_to_half(src); -} - -inline __device__ void from_float(uint32_t& dst, float2 src) { - dst = float2_to_half2(src); -} - -inline __device__ void from_float(uint2& dst, Float4_ src) { - dst.x = float2_to_half2(src.x); - dst.y = float2_to_half2(src.y); -} - -inline __device__ void from_float(uint4& dst, Float8_ src) { - dst.x = float2_to_half2(src.x); - dst.y = float2_to_half2(src.y); - dst.z = float2_to_half2(src.z); - dst.w = float2_to_half2(src.w); -} - -// From float16 to float32. 
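The packed `fma` overloads below follow the same layout convention: each 32-bit operand holds two fp16 lanes, `d = a * b + c` is applied per lane, and the wider `uint2`/`uint4` variants simply map over their `.x/.y/.z/.w` members. A CPU-only sketch of the lane-wise semantics, with float lanes standing in for fp16 (the `Lanes2`/`fma2` names are illustrative only):

```cpp
#include <cmath>
#include <cstdio>

// Two "lanes" packed in one value, mirroring an f16x2 register.
struct Lanes2 { float x, y; };

// Lane-wise fused multiply-add: d = a * b + c applied to each lane.
static Lanes2 fma2(Lanes2 a, Lanes2 b, Lanes2 c) {
    return { std::fma(a.x, b.x, c.x), std::fma(a.y, b.y, c.y) };
}

int main() {
    Lanes2 d = fma2({1.0f, 2.0f}, {3.0f, 4.0f}, {0.5f, 0.5f});
    std::printf("%.1f %.1f\n", d.x, d.y);  // 3.5 8.5
    return 0;
}
```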
-inline __device__ float to_float(uint16_t u) { - return half_to_float(u); -} - -inline __device__ float2 to_float(uint32_t u) { - return half2_to_float2(u); -} - -inline __device__ Float4_ to_float(uint2 u) { - Float4_ tmp; - tmp.x = half2_to_float2(u.x); - tmp.y = half2_to_float2(u.y); - return tmp; -} - -inline __device__ Float8_ to_float(uint4 u) { - Float8_ tmp; - tmp.x = half2_to_float2(u.x); - tmp.y = half2_to_float2(u.y); - tmp.z = half2_to_float2(u.z); - tmp.w = half2_to_float2(u.w); - return tmp; -} - -// Zero-out a variable. -inline __device__ void zero(uint16_t& dst) { - dst = uint16_t(0); -} - -} // namespace vllm diff --git a/csrc/attention/dtype_float32.cuh b/csrc/attention/dtype_float32.cuh deleted file mode 100644 index b200d2d226eb04792ec3d18a48a5210c40a2d92b..0000000000000000000000000000000000000000 --- a/csrc/attention/dtype_float32.cuh +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp - * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h - * Copyright (c) 2023, The vLLM team. - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "attention_generic.cuh" - -#include <stdint.h> - -namespace vllm { - -// Define custom FP32 vector data types. -struct Float4_ { - float2 x; - float2 y; -}; - -struct Float8_ { - float2 x; - float2 y; - float2 z; - float2 w; -}; - -// FP32 vector types for Q, K, V. -template<> -struct Vec<float, 1> { - using Type = float; -}; -template<> -struct Vec<float, 2> { - using Type = float2; -}; -template<> -struct Vec<float, 4> { - using Type = float4; -}; - -// FP32 accumulator vector types corresponding to Vec. -template<> -struct FloatVec<float> { - using Type = float; -}; -template<> -struct FloatVec<float2> { - using Type = float2; -}; -template<> -struct FloatVec<float4> { - using Type = float4; -}; - -// Vector addition. -inline __device__ float add(float a, float b) { - return a + b; -} - -inline __device__ float2 add(float2 a, float2 b) { - float2 c; - c.x = add(a.x, b.x); - c.y = add(a.y, b.y); - return c; -} - -inline __device__ float4 add(float4 a, float4 b) { - float4 c; - c.x = add(a.x, b.x); - c.y = add(a.y, b.y); - c.z = add(a.z, b.z); - c.w = add(a.w, b.w); - return c; -} - -// Vector multiplication. 
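dtype_float32.cuh above follows the same recipe as the fp16 header: `Vec<T, N>` is a trait table that maps a scalar type and lane count to the concrete vector type used for loads and stores, and `FloatVec<V>` names the fp32 accumulator matching each vector type. A standalone sketch of the trait pattern, using stand-in structs (`float2_`, `float4_`) instead of CUDA's built-in vector types:

```cpp
#include <cstdio>

// Specialization table mapping (scalar type, lane count) -> vector type.
template <typename T, int N> struct Vec {};

struct float2_ { float x, y; };
struct float4_ { float x, y, z, w; };

template <> struct Vec<float, 1> { using Type = float;   };
template <> struct Vec<float, 2> { using Type = float2_; };
template <> struct Vec<float, 4> { using Type = float4_; };

int main() {
    Vec<float, 4>::Type v{1.0f, 2.0f, 3.0f, 4.0f};
    std::printf("%zu bytes per 4-wide vector, first lane %.1f\n", sizeof(v), v.x);
    return 0;
}
```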
-template<> -inline __device__ float mul<float, float>(float a, float b) { - return a * b; -} - -template<> -inline __device__ float2 mul(float2 a, float2 b) { - float2 c; - c.x = a.x * b.x; - c.y = a.y * b.y; - return c; -} - -template<> -inline __device__ float2 mul(float a, float2 b) { - float2 c; - c.x = a * b.x; - c.y = a * b.y; - return c; -} - -template<> -inline __device__ float4 mul(float4 a, float4 b) { - float4 c; - c.x = a.x * b.x; - c.y = a.y * b.y; - c.z = a.z * b.z; - c.w = a.w * b.w; - return c; -} - -template<> -inline __device__ float4 mul(float a, float4 b) { - float4 c; - c.x = a * b.x; - c.y = a * b.y; - c.z = a * b.z; - c.w = a * b.w; - return c; -} - -// Vector fused multiply-add. -inline __device__ float fma(float a, float b, float c) { - return a * b + c; -} - -inline __device__ float2 fma(float2 a, float2 b, float2 c) { - float2 d; - d.x = fma(a.x, b.x, c.x); - d.y = fma(a.y, b.y, c.y); - return d; -} - -inline __device__ float2 fma(float a, float2 b, float2 c) { - float2 d; - d.x = fma(a, b.x, c.x); - d.y = fma(a, b.y, c.y); - return d; -} - -inline __device__ float4 fma(float4 a, float4 b, float4 c) { - float4 d; - d.x = fma(a.x, b.x, c.x); - d.y = fma(a.y, b.y, c.y); - d.z = fma(a.z, b.z, c.z); - d.w = fma(a.w, b.w, c.w); - return d; -} - -inline __device__ float4 fma(float a, float4 b, float4 c) { - float4 d; - d.x = fma(a, b.x, c.x); - d.y = fma(a, b.y, c.y); - d.z = fma(a, b.z, c.z); - d.w = fma(a, b.w, c.w); - return d; -} - -inline __device__ Float4_ fma(float a, Float4_ b, Float4_ c) { - Float4_ d; - d.x = fma(a, b.x, c.x); - d.y = fma(a, b.y, c.y); - return d; -} - -inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) { - Float8_ d; - d.x = fma(a, b.x, c.x); - d.y = fma(a, b.y, c.y); - d.z = fma(a, b.z, c.z); - d.w = fma(a, b.w, c.w); - return d; -} - -// Vector sum. -template<> -inline __device__ float sum(float v) { - return v; -} - -template<> -inline __device__ float sum(float2 v) { - return v.x + v.y; -} - -template<> -inline __device__ float sum(float4 v) { - return v.x + v.y + v.z + v.w; -} - -template<> -inline __device__ float sum(Float4_ v) { - return v.x.x + v.x.y + v.y.x + v.y.y; -} - -template<> -inline __device__ float sum(Float8_ v) { - return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y; -} - -// Vector dot product. -inline __device__ float dot(float a, float b) { - return a * b; -} - -inline __device__ float dot(float2 a, float2 b) { - float2 c = mul<float2, float2, float2>(a, b); - return c.x + c.y; -} - -inline __device__ float dot(Float4_ a, Float4_ b) { - float2 acc = mul<float2, float2, float2>(a.x, b.x); - acc = fma(a.y, b.y, acc); - return acc.x + acc.y; -} - -inline __device__ float dot(Float8_ a, Float8_ b) { - float2 acc = mul<float2, float2, float2>(a.x, b.x); - acc = fma(a.y, b.y, acc); - acc = fma(a.z, b.z, acc); - acc = fma(a.w, b.w, acc); - return acc.x + acc.y; -} - -// From float to float. -inline __device__ void from_float(float& dst, float src) { - dst = src; -} - -inline __device__ void from_float(float2& dst, float2 src) { - dst = src; -} - -inline __device__ void from_float(float4& dst, float4 src) { - dst = src; -} - -// From float to float. -inline __device__ float to_float(float u) { - return u; -} - -inline __device__ float2 to_float(float2 u) { - return u; -} - -inline __device__ float4 to_float(float4 u) { - return u; -} - -inline __device__ Float4_ to_float(Float4_ u) { - return u; -} - -inline __device__ Float8_ to_float(Float8_ u) { - return u; -} - -// Zero-out a variable. 
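The `dot` helpers above reduce a pair of packed vectors to a scalar by multiplying the first lane pair, folding the remaining pairs in with fused multiply-adds, and finishing with a horizontal sum of the accumulator lanes. A CPU-only sketch of that reduction shape over four lanes (the `F2`/`dot4` names are illustrative, not the deleted code):

```cpp
#include <cstdio>

struct F2 { float x, y; };

static F2 mul2(F2 a, F2 b) { return { a.x * b.x, a.y * b.y }; }
static F2 fma2(F2 a, F2 b, F2 c) { return { a.x * b.x + c.x, a.y * b.y + c.y }; }

// dot over 4 lanes: multiply the first lane pair, fma the second pair into the
// accumulator, then horizontally sum the two accumulator lanes.
static float dot4(F2 a_lo, F2 a_hi, F2 b_lo, F2 b_hi) {
    F2 acc = mul2(a_lo, b_lo);
    acc = fma2(a_hi, b_hi, acc);
    return acc.x + acc.y;
}

int main() {
    // (1,2,3,4) . (5,6,7,8) = 5 + 12 + 21 + 32 = 70
    std::printf("%.1f\n", dot4({1, 2}, {3, 4}, {5, 6}, {7, 8}));
    return 0;
}
```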
-inline __device__ void zero(float& dst) { - dst = 0.f; -} - -} // namespace vllm diff --git a/csrc/attention/dtype_fp8_e5m2.cuh b/csrc/attention/dtype_fp8_e5m2.cuh deleted file mode 100644 index 0580fbb8e863f74f27e6499a86d4fcf92e462514..0000000000000000000000000000000000000000 --- a/csrc/attention/dtype_fp8_e5m2.cuh +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include "attention_generic.cuh" - -#include <stdint.h> -#ifdef ENABLE_FP8_E5M2 -#include <cuda_fp8.h> -#endif - -namespace vllm { -#ifdef ENABLE_FP8_E5M2 -// fp8 vector types for quantization of kv cache - -template<> -struct Vec<uint8_t, 1> { - using Type = uint8_t; -}; - -template<> -struct Vec<uint8_t, 2> { - using Type = uint16_t; -}; - -template<> -struct Vec<uint8_t, 4> { - using Type = uint32_t; -}; - -template<> -struct Vec<uint8_t, 8> { - using Type = uint2; -}; -#endif // ENABLE_FP8_E5M2 - -} // namespace vllm diff --git a/csrc/cache.h b/csrc/cache.h deleted file mode 100644 index 21c71830f7942cc90cd33fe0dd7c3e0fcc6732b6..0000000000000000000000000000000000000000 --- a/csrc/cache.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#include <torch/extension.h> - -#include <map> -#include <vector> - -void swap_blocks( - torch::Tensor& src, - torch::Tensor& dst, - const std::map<int64_t, int64_t>& block_mapping); - -void copy_blocks( - std::vector<torch::Tensor>& key_caches, - std::vector<torch::Tensor>& value_caches, - const std::map<int64_t, std::vector<int64_t>>& block_mapping); - -void reshape_and_cache( - torch::Tensor& key, - torch::Tensor& value, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype); - -void gather_cached_kv( - torch::Tensor& key, - torch::Tensor& value, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - torch::Tensor& slot_mapping); - -// Just for unittest -void convert_fp8_e5m2( - torch::Tensor& src_cache, - torch::Tensor& dst_cache); diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu deleted file mode 100644 index fe0159e40458502db6d47a95ee598025f111716b..0000000000000000000000000000000000000000 --- a/csrc/cache_kernels.cu +++ /dev/null @@ -1,474 +0,0 @@ -#include <torch/extension.h> -#include <ATen/cuda/CUDAContext.h> -#include <c10/cuda/CUDAGuard.h> - -#include "cuda_compat.h" -#include "dispatch_utils.h" -#include "quantization/fp8_e5m2_kvcache/quant_utils.cuh" - -#include <algorithm> -#include <cassert> -#include <map> -#include <vector> - -void swap_blocks( - torch::Tensor& src, - torch::Tensor& dst, - const std::map<int64_t, int64_t>& block_mapping) { - torch::Device src_device = src.device(); - torch::Device dst_device = dst.device(); - cudaMemcpyKind memcpy_type; - if (src_device.is_cuda() && dst_device.is_cuda()) { - TORCH_CHECK( - src_device.index() == dst_device.index(), - "src and dst must be on the same GPU"); - memcpy_type = cudaMemcpyDeviceToDevice; - } else if (src_device.is_cuda() && dst_device.is_cpu()) { - memcpy_type = cudaMemcpyDeviceToHost; - } else if (src_device.is_cpu() && dst_device.is_cuda()) { - memcpy_type = cudaMemcpyHostToDevice; - } else { - TORCH_CHECK(false, "Invalid device combination"); - } - - char *src_ptr = static_cast<char*>(src.data_ptr()); - char *dst_ptr = static_cast<char*>(dst.data_ptr()); - - const int64_t block_size_in_bytes = src.element_size() * src[0].numel(); - const at::cuda::OptionalCUDAGuard device_guard(src_device.is_cuda() ? 
src_device : dst_device); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - // NOTE(woosuk): This can be slow if the number of blocks is large. - for (const auto& pair : block_mapping) { - int64_t src_block_number = pair.first; - int64_t dst_block_number = pair.second; - int64_t src_offset = src_block_number * block_size_in_bytes; - int64_t dst_offset = dst_block_number * block_size_in_bytes; - cudaMemcpyAsync( - dst_ptr + dst_offset, - src_ptr + src_offset, - block_size_in_bytes, - memcpy_type, - stream); - } -} - -namespace vllm { - -// Grid: (num_layers, num_pairs) -template<typename scalar_t> -__global__ void copy_blocks_kernel( - int64_t* key_cache_ptrs, - int64_t* value_cache_ptrs, - const int64_t* __restrict__ block_mapping, - const int numel_per_block) { - const int layer_idx = blockIdx.x; - const int pair_idx = blockIdx.y; - - scalar_t* key_cache = reinterpret_cast<scalar_t*>(key_cache_ptrs[layer_idx]); - scalar_t* value_cache = reinterpret_cast<scalar_t*>(value_cache_ptrs[layer_idx]); - int64_t src_block_number = block_mapping[2 * pair_idx]; - int64_t dst_block_number = block_mapping[2 * pair_idx + 1]; - - const int64_t src_block_offset = src_block_number * numel_per_block; - const int64_t dst_block_offset = dst_block_number * numel_per_block; - for (int i = threadIdx.x; i < numel_per_block; i += blockDim.x) { - int64_t src_offset = src_block_offset + i; - int64_t dst_offset = dst_block_offset + i; - key_cache[dst_offset] = key_cache[src_offset]; - } - for (int i = threadIdx.x; i < numel_per_block; i += blockDim.x) { - int64_t src_offset = src_block_offset + i; - int64_t dst_offset = dst_block_offset + i; - value_cache[dst_offset] = value_cache[src_offset]; - } -} - -} // namespace vllm - -void copy_blocks( - std::vector<torch::Tensor>& key_caches, - std::vector<torch::Tensor>& value_caches, - const std::map<int64_t, std::vector<int64_t>>& block_mapping) { - int num_layers = key_caches.size(); - TORCH_CHECK(num_layers == value_caches.size()); - if (num_layers == 0) { - return; - } - torch::Device cache_device = key_caches[0].device(); - TORCH_CHECK(cache_device.is_cuda()); - - // Create data structures for the kernel. - // Create an array of pointers to the key and value caches. - int64_t key_cache_ptrs[num_layers]; - int64_t value_cache_ptrs[num_layers]; - for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { - key_cache_ptrs[layer_idx] = reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr()); - value_cache_ptrs[layer_idx] = reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr()); - } - // Create block mapping array. - std::vector<int64_t> block_mapping_vec; - for (const auto& pair : block_mapping) { - int64_t src_block_number = pair.first; - for (int64_t dst_block_number : pair.second) { - block_mapping_vec.push_back(src_block_number); - block_mapping_vec.push_back(dst_block_number); - } - } - int64_t* block_mapping_array = block_mapping_vec.data(); - int num_pairs = block_mapping_vec.size() / 2; - - // Move the data structures to the GPU. - // NOTE: This synchronizes the CPU and GPU. - torch::Tensor key_cache_ptrs_tensor = torch::from_blob( - key_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device); - torch::Tensor value_cache_ptrs_tensor = torch::from_blob( - value_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device); - torch::Tensor block_mapping_tensor = torch::from_blob( - block_mapping_array, {2 * num_pairs}, torch::kInt64).to(cache_device); - - // Launch the kernel. 
- const int numel_per_block = key_caches[0][0].numel(); - dim3 grid(num_layers, num_pairs); - dim3 block(std::min(1024, numel_per_block)); - const at::cuda::OptionalCUDAGuard device_guard(cache_device); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( - key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] { - vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>( - key_cache_ptrs_tensor.data_ptr<int64_t>(), - value_cache_ptrs_tensor.data_ptr<int64_t>(), - block_mapping_tensor.data_ptr<int64_t>(), - numel_per_block); - })); -} - -namespace vllm { - -template<typename scalar_t, typename cache_t, bool is_fp8_e5m2_kv_cache> -__global__ void reshape_and_cache_kernel( - const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] - const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - cache_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - cache_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] - const int64_t* __restrict__ slot_mapping, // [num_tokens] - const int key_stride, - const int value_stride, - const int num_heads, - const int head_size, - const int block_size, - const int x) { - const int64_t token_idx = blockIdx.x; - const int64_t slot_idx = slot_mapping[token_idx]; - if (slot_idx < 0) { - // Padding token that should be ignored. - return; - } - - const int64_t block_idx = slot_idx / block_size; - const int64_t block_offset = slot_idx % block_size; - - const int n = num_heads * head_size; - for (int i = threadIdx.x; i < n; i += blockDim.x) { - const int64_t src_key_idx = token_idx * key_stride + i; - const int64_t src_value_idx = token_idx * value_stride + i; - - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int x_idx = head_offset / x; - const int x_offset = head_offset % x; - - const int64_t tgt_key_idx = block_idx * num_heads * (head_size / x) * block_size * x - + head_idx * (head_size / x) * block_size * x - + x_idx * block_size * x - + block_offset * x - + x_offset; - const int64_t tgt_value_idx = block_idx * num_heads * head_size * block_size - + head_idx * head_size * block_size - + head_offset * block_size - + block_offset; - scalar_t tgt_key = key[src_key_idx]; - scalar_t tgt_value = value[src_value_idx]; - if constexpr (is_fp8_e5m2_kv_cache) { -#ifdef ENABLE_FP8_E5M2 - key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion<uint8_t, scalar_t>(tgt_key); - value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion<uint8_t, scalar_t>(tgt_value); -#else - assert(false); -#endif - } else { - key_cache[tgt_key_idx] = tgt_key; - value_cache[tgt_value_idx] = tgt_value; - } - } -} - -} // namespace vllm - -#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ - vllm::reshape_and_cache_kernel<KV_T, CACHE_T, IS_FP8_E5M2_KV_CACHE><<<grid, block, 0, stream>>>( \ - reinterpret_cast<KV_T*>(key.data_ptr()), \ - reinterpret_cast<KV_T*>(value.data_ptr()), \ - reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \ - reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \ - slot_mapping.data_ptr<int64_t>(), \ - key_stride, \ - value_stride, \ - num_heads, \ - head_size, \ - block_size, \ - x); - -void reshape_and_cache( - torch::Tensor& key, // [num_tokens, num_heads, head_size] - torch::Tensor& value, // [num_tokens, num_heads, head_size] - torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& value_cache, // [num_blocks, num_heads, 
head_size, block_size] - torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype) -{ - int num_tokens = key.size(0); - int num_heads = key.size(1); - int head_size = key.size(2); - int block_size = key_cache.size(3); - int x = key_cache.size(4); - - int key_stride = key.stride(0); - int value_stride = value.stride(0); - - dim3 grid(num_tokens); - dim3 block(std::min(num_heads * head_size, 512)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - if (kv_cache_dtype == "auto") { - if (key.dtype() == at::ScalarType::Float) { - CALL_RESHAPE_AND_CACHE(float, float, false); - } else if (key.dtype() == at::ScalarType::Half) { - CALL_RESHAPE_AND_CACHE(uint16_t, uint16_t, false); - } else if (key.dtype() == at::ScalarType::BFloat16) { - CALL_RESHAPE_AND_CACHE(__nv_bfloat16, __nv_bfloat16, false); - } - } else if (kv_cache_dtype == "fp8_e5m2") { - if (key.dtype() == at::ScalarType::Float) { - CALL_RESHAPE_AND_CACHE(float, uint8_t, true); - } else if (key.dtype() == at::ScalarType::Half) { - CALL_RESHAPE_AND_CACHE(uint16_t, uint8_t, true); - } else if (key.dtype() == at::ScalarType::BFloat16) { - CALL_RESHAPE_AND_CACHE(__nv_bfloat16, uint8_t, true); - } - } else { - TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype); - } -} - -namespace vllm { - -// Grid: (num_blocks, block_size). -template<typename scalar_t> -__global__ void gather_cached_kv_kernel( - scalar_t* __restrict__ key, // [num_tokens, [stride], num_heads, head_size] - scalar_t* __restrict__ value, // [num_tokens, [stride], num_heads, head_size] - const scalar_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - const scalar_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] - const int* __restrict__ slot_mapping, // [num_tokens] - const int key_stride, - const int value_stride, - const int num_heads, - const int head_size, - const int block_size, - const int x) { - const int token_idx = blockIdx.x; - const int slot_idx = slot_mapping[token_idx]; - const int block_idx = slot_idx / block_size; - const int block_offset = slot_idx % block_size; - - const int num_tokens = num_heads * head_size; - for (int i = threadIdx.x; i < num_tokens; i += blockDim.x) { - const int tgt_key_idx = token_idx * key_stride + i; - const int tgt_value_idx = token_idx * value_stride + i; - - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int x_idx = head_offset / x; // the offset of the [head_size/x] dimension - const int x_offset = head_offset % x; - - const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x - + head_idx * (head_size / x) * block_size * x - + x_idx * block_size * x - + block_offset * x - + x_offset; - const int src_value_idx = block_idx * num_heads * head_size * block_size - + head_idx * head_size * block_size - + head_offset * block_size - + block_offset; - - key[tgt_key_idx] = VLLM_LDG(&key_cache[src_key_idx]); - value[tgt_value_idx] = VLLM_LDG(&value_cache[src_value_idx]); - } -} - -template <typename scalar_t> -__global__ void gather_cached_kv_kernel_optimized( - scalar_t *__restrict__ key, // [num_tokens, [stride], num_heads, head_size] - scalar_t *__restrict__ value, // [num_tokens, [stride], num_heads, head_size] - const scalar_t *__restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - const scalar_t *__restrict__ value_cache, // [num_blocks, num_heads, head_size, 
block_size] - const int *__restrict__ slot_mapping, // [num_tokens] - const int key_stride, - const int value_stride, - const int num_heads, - const int head_size, - const int block_size, - const int x) -{ - const int token_idx = blockIdx.x; - const int slot_idx = slot_mapping[token_idx]; - const int block_idx = slot_idx / block_size; - const int block_offset = slot_idx % block_size; - - const int dim = num_heads * head_size; - assert(dim % 4 == 0); // this is true for known use cases - const int unroll_factor = 4; - const int unrolled_dim = dim / unroll_factor; - - for (int i = threadIdx.x; i < unrolled_dim; i += blockDim.x) - { - int tgt_key_indices[unroll_factor]; - int tgt_value_indices[unroll_factor]; - int src_key_indices[unroll_factor]; - int src_value_indices[unroll_factor]; - scalar_t keys_to_store[unroll_factor]; - scalar_t values_to_store[unroll_factor]; - - #pragma unroll - for (int j = 0; j < unroll_factor; ++j) - { - int index = i + j * unrolled_dim; - - const int tgt_key_idx = token_idx * key_stride + index; - const int tgt_value_idx = token_idx * value_stride + index; - - const int head_idx = index / head_size; - const int head_offset = index % head_size; - const int x_idx = head_offset / x; - const int x_offset = head_offset % x; - - const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x - + head_idx * (head_size / x) * block_size * x - + x_idx * block_size * x - + block_offset * x - + x_offset; - const int src_value_idx = block_idx * num_heads * head_size * block_size - + head_idx * head_size * block_size - + head_offset * block_size - + block_offset; - - tgt_key_indices[j] = tgt_key_idx; - tgt_value_indices[j] = tgt_value_idx; - src_key_indices[j] = src_key_idx; - src_value_indices[j] = src_value_idx; - - keys_to_store[j] = VLLM_LDG(&key_cache[src_key_idx]); - values_to_store[j] = VLLM_LDG(&value_cache[src_value_idx]); - } - - #pragma unroll - for (int j = 0; j < unroll_factor; ++j) - { - key[tgt_key_indices[j]] = keys_to_store[j]; - value[tgt_value_indices[j]] = values_to_store[j]; - } - } -} - -} // namespace vllm - -void gather_cached_kv( - torch::Tensor& key, // [out] [num_tokens, num_heads, head_size] - torch::Tensor& value, // [out] [num_tokens, num_heads, head_size] - torch::Tensor& key_cache, // [in] [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& value_cache, // [in] [num_blocks, num_heads, head_size, block_size] - torch::Tensor& slot_mapping) // [in] [num_tokens] -{ - int num_tokens = key.size(0); - int num_heads = key.size(1); - int head_size = key.size(2); - int block_size = key_cache.size(3); - int x = key_cache.size(4); - - int key_stride = key.stride(0); - int value_stride = value.stride(0); - - dim3 grid(num_tokens); - dim3 block(std::min(num_heads * head_size, 512)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( - key.scalar_type(), - "gather_cached_kv_kernel_optimized", - [&] { - vllm::gather_cached_kv_kernel_optimized<scalar_t><<<grid, block, 0, stream>>>( - key.data_ptr<scalar_t>(), - value.data_ptr<scalar_t>(), - key_cache.data_ptr<scalar_t>(), - value_cache.data_ptr<scalar_t>(), - slot_mapping.data_ptr<int>(), - key_stride, - value_stride, - num_heads, - head_size, - block_size, - x); - }); -} - -namespace vllm { - -template<typename Tout, typename Tin> -__global__ void convert_fp8_e5m2_kernel( - const Tin* __restrict__ src_cache, - Tout* __restrict__ dst_cache, - const int64_t 
block_stride) { - const int64_t block_idx = blockIdx.x; - for (int i = threadIdx.x; i < block_stride; i += blockDim.x) { - int64_t idx = block_idx * block_stride + i; -#ifdef ENABLE_FP8_E5M2 - dst_cache[idx] = fp8_e5m2_unscaled::vec_conversion<Tout, Tin>(src_cache[idx]); -#else - assert(false); -#endif - } -} - -} // namespace vllm - -#define CALL_CONVERT_FP8_E5M2(Tout, Tin) \ - vllm::convert_fp8_e5m2_kernel<Tout, Tin><<<grid, block, 0, stream>>>( \ - reinterpret_cast<Tin*>(src_cache.data_ptr()), \ - reinterpret_cast<Tout*>(dst_cache.data_ptr()), \ - block_stride); - -void convert_fp8_e5m2( - torch::Tensor& src_cache, - torch::Tensor& dst_cache) -{ - int64_t num_blocks = src_cache.size(0); - int64_t block_stride = src_cache.stride(0); - - dim3 grid(num_blocks); - dim3 block(std::min(block_stride, int64_t(512))); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - if (src_cache.dtype() == at::ScalarType::Float) { - CALL_CONVERT_FP8_E5M2(uint8_t, float); - } else if (src_cache.dtype() == at::ScalarType::Half) { - CALL_CONVERT_FP8_E5M2(uint8_t, uint16_t); - } else if (src_cache.dtype() == at::ScalarType::BFloat16) { - CALL_CONVERT_FP8_E5M2(uint8_t, __nv_bfloat16); - } else if (dst_cache.dtype() == at::ScalarType::Float) { - CALL_CONVERT_FP8_E5M2(float, uint8_t); - } else if (dst_cache.dtype() == at::ScalarType::Half) { - CALL_CONVERT_FP8_E5M2(uint16_t, uint8_t); - } else if (dst_cache.dtype() == at::ScalarType::BFloat16) { - CALL_CONVERT_FP8_E5M2(__nv_bfloat16, uint8_t); - } -} diff --git a/csrc/cuda_compat.h b/csrc/cuda_compat.h deleted file mode 100644 index aa58dd73c148a087694209c29b70f6c5e0916678..0000000000000000000000000000000000000000 --- a/csrc/cuda_compat.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#ifndef USE_ROCM - #define VLLM_LDG(arg) __ldg(arg) -#else - #define VLLM_LDG(arg) *(arg) -#endif - -#ifndef USE_ROCM - #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) -#else - #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) -#endif - -#ifndef USE_ROCM - #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) -#else - #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) -#endif - -#ifndef USE_ROCM - #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ - cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL) -#else - #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ - hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) -#endif - diff --git a/csrc/cuda_utils.h b/csrc/cuda_utils.h deleted file mode 100644 index 1483484faeb4a59f371f367e56f732ef496ac862..0000000000000000000000000000000000000000 --- a/csrc/cuda_utils.h +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -#include <torch/extension.h> - -int get_device_attribute( - int attribute, - int device_id); - -int get_max_shared_memory_per_block_device_attribute( - int device_id); diff --git a/csrc/cuda_utils_kernels.cu b/csrc/cuda_utils_kernels.cu deleted file mode 100644 index 1a443ef3620ccd38221f9f3a13106e064748b9cc..0000000000000000000000000000000000000000 --- a/csrc/cuda_utils_kernels.cu +++ /dev/null @@ -1,35 +0,0 @@ -#ifdef USE_ROCM - #include <hip/hip_runtime.h> - #include <hip/hip_runtime_api.h> -#endif -int get_device_attribute( - int attribute, - int device_id) -{ - int device, value; - if (device_id < 0) { - cudaGetDevice(&device); - } - else { - device = device_id; - } - cudaDeviceGetAttribute(&value, 
static_cast<cudaDeviceAttr>(attribute), device); - return value; -} - - -int get_max_shared_memory_per_block_device_attribute( - int device_id) -{ -int attribute; -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html -// cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74 - -#ifdef USE_ROCM - attribute = hipDeviceAttributeMaxSharedMemoryPerBlock; -#else - attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin; -#endif - - return get_device_attribute(attribute, device_id); -} diff --git a/csrc/custom_all_reduce.cu b/csrc/custom_all_reduce.cu deleted file mode 100644 index 88e4af9d4a99f40de5b09b6f12b165f94d79557c..0000000000000000000000000000000000000000 --- a/csrc/custom_all_reduce.cu +++ /dev/null @@ -1,148 +0,0 @@ -#include <ATen/cuda/Exceptions.h> -#include <c10/cuda/CUDAGuard.h> -#include <c10/cuda/CUDAStream.h> -#include <torch/extension.h> - -#include "custom_all_reduce.cuh" - -// fake pointer type -using fptr_t = uint64_t; -static_assert(sizeof(void *) == sizeof(fptr_t)); - -fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, - const std::vector<std::string> &handles, - const std::vector<int64_t> &offsets, int rank, - bool full_nvlink) { - int world_size = offsets.size(); - if (world_size > 8) - throw std::invalid_argument("world size > 8 is not supported"); - if (world_size % 2 != 0) - throw std::invalid_argument("Odd num gpus is not supported for now"); - if (world_size != handles.size()) - throw std::invalid_argument( - "handles length should equal to offsets length"); - if (rank < 0 || rank >= world_size) - throw std::invalid_argument("invalid rank passed in"); - - cudaIpcMemHandle_t ipc_handles[8]; - for (int i = 0; i < world_size; i++) { - std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t)); - } - return (fptr_t) new vllm::CustomAllreduce( - reinterpret_cast<vllm::Metadata *>(meta.data_ptr()), rank_data.data_ptr(), - rank_data.numel(), ipc_handles, offsets, rank, full_nvlink); -} - -/** - * Make sure tensor t's data lies completely within ((char)t.data_ptr()) + - * t.numel() * t.element_size(). This is slightly weaker than t.is_contiguous() - * because it allows transpose of contiguous slice (i.e. slicing the first - * dimension). Currently, we require this because stride information is not - * passed into the kernels and we treat input tensors as flat. - * - * Examples - * A = torch.zeros(3, 3, 3) - * 1. A: OK - * 2. A[1:]: OK - * 3. A.permute(2, 0, 1): OK - * 4. A[1:].permute(2, 0, 1): OK - * 5. A[None].expand(2, -1, -1, -1): Not OK - * 6. 
A[:, 1:, 1:]: Not OK - */ -bool _is_weak_contiguous(torch::Tensor &t) { - return t.is_contiguous() || - (t.storage().nbytes() - t.storage_offset() * t.element_size() == - t.numel() * t.element_size()); -} - -bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size, - bool full_nvlink) { - auto inp_size = inp.numel() * inp.element_size(); - // custom allreduce requires input byte size to be multiples of 16 - if (inp_size % 16 != 0) return false; - if (!_is_weak_contiguous(inp)) return false; - if (world_size == 2 || full_nvlink) return inp_size <= max_size; - // 4 PCIE GPUs use 2 stage allreduce, and is only faster than NCCL when size - // <= 512k - return world_size <= 4 && inp_size <= 512 * 1024; -} - -void _all_reduce(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out, - cudaStream_t stream) { - auto fa = reinterpret_cast<vllm::CustomAllreduce *>(_fa); - TORCH_CHECK(_is_weak_contiguous(out)); - switch (out.scalar_type()) { - case at::ScalarType::Float: { - fa->allreduce<float>(stream, reinterpret_cast<float *>(inp.data_ptr()), - reinterpret_cast<float *>(out.data_ptr()), - out.numel()); - break; - } - case at::ScalarType::Half: { - fa->allreduce<half>(stream, reinterpret_cast<half *>(inp.data_ptr()), - reinterpret_cast<half *>(out.data_ptr()), - out.numel()); - break; - } -#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) - case at::ScalarType::BFloat16: { - fa->allreduce<nv_bfloat16>( - stream, reinterpret_cast<nv_bfloat16 *>(inp.data_ptr()), - reinterpret_cast<nv_bfloat16 *>(out.data_ptr()), out.numel()); - break; - } -#endif - default: - throw std::runtime_error( - "custom allreduce only supports float32, float16 and bfloat16"); - } -} - -void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); - auto stream = c10::cuda::getCurrentCUDAStream().stream(); - TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); - TORCH_CHECK_EQ(inp.numel(), out.numel()); - _all_reduce(_fa, inp, out, stream); -} - -void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor ®_buffer, - torch::Tensor &out) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); - auto stream = c10::cuda::getCurrentCUDAStream().stream(); - - auto input_size = inp.numel() * inp.element_size(); - TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); - TORCH_CHECK_EQ(inp.numel(), out.numel()); - TORCH_CHECK(input_size <= reg_buffer.numel() * reg_buffer.element_size(), - "registered buffer is too small to contain the input"); - AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer.data_ptr(), inp.data_ptr(), - input_size, cudaMemcpyDeviceToDevice, stream)); - _all_reduce(_fa, reg_buffer, out, stream); -} - -void dispose(fptr_t _fa) { - auto fa = reinterpret_cast<vllm::CustomAllreduce *>(_fa); - delete fa; -} - -int meta_size() { return sizeof(vllm::Metadata); } - -void register_buffer(fptr_t _fa, torch::Tensor &t, - const std::vector<std::string> &handles, - const std::vector<int64_t> &offsets) { - auto fa = reinterpret_cast<vllm::CustomAllreduce *>(_fa); - fa->register_buffer(handles, offsets, t.data_ptr()); -} - -std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta( - fptr_t _fa) { - auto fa = reinterpret_cast<vllm::CustomAllreduce *>(_fa); - return fa->get_graph_buffer_ipc_meta(); -} - -void register_graph_buffers(fptr_t _fa, const std::vector<std::string> &handles, - const std::vector<std::vector<int64_t>> &offsets) { - auto fa = reinterpret_cast<vllm::CustomAllreduce *>(_fa); - 
fa->register_graph_buffers(handles, offsets); -} diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh deleted file mode 100644 index 54409e19eb4556c57db122a8526e8402a234d7b0..0000000000000000000000000000000000000000 --- a/csrc/custom_all_reduce.cuh +++ /dev/null @@ -1,562 +0,0 @@ -#pragma once - -#include <cuda.h> -#include <cuda_bf16.h> -#include <cuda_fp16.h> -#include <cuda_runtime.h> - -#include <iostream> -#include <limits> -#include <map> -#include <unordered_map> -#include <vector> - -#define CUDACHECK(cmd) \ - do { \ - cudaError_t e = cmd; \ - if (e != cudaSuccess) { \ - printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ - cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -namespace vllm { - -struct Signal { - alignas(64) union { - uint64_t flag; - unsigned char data[8]; - } start; - alignas(64) union { - uint64_t flag; - unsigned char data[8]; - } end; -}; - -struct Metadata { - alignas(128) Signal sg; - alignas(128) int counter; -}; -static_assert(offsetof(Metadata, counter) == 128); -static_assert(sizeof(Metadata) == 256); - -struct __align__(16) RankData { const void *__restrict__ ptrs[8]; }; - -struct RankSignals { - volatile Signal *signals[8]; -}; - -// like std::array, but aligned -template <typename T, int sz> -struct __align__(alignof(T) * sz) array_t { - T data[sz]; - using type = T; - static constexpr int size = sz; -}; - -// use packed type to maximize memory efficiency -// goal: generate ld.128 and st.128 instructions -template <typename T> -struct packed_t { - // the (P)acked type for load/store - using P = array_t<T, 16 / sizeof(T)>; - // the (A)ccumulator type for reduction - using A = array_t<float, 16 / sizeof(T)>; -}; - -#define DINLINE __device__ __forceinline__ - -// scalar cast functions -DINLINE float upcast_s(half val) { return __half2float(val); } - -template <typename T> -DINLINE T downcast_s(float val); -template <> -DINLINE half downcast_s(float val) { - return __float2half(val); -} - -// scalar add functions -// for some reason when compiling with Pytorch, the + operator for half and -// bfloat is disabled so we call the intrinsics directly -DINLINE half &assign_add(half &a, half b) { - a = __hadd(a, b); - return a; -} -DINLINE float &assign_add(float &a, float b) { return a += b; } - -#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) -DINLINE float upcast_s(nv_bfloat16 val) { return __bfloat162float(val); } -template <> -DINLINE nv_bfloat16 downcast_s(float val) { - return __float2bfloat16(val); -} -DINLINE nv_bfloat16 &assign_add(nv_bfloat16 &a, nv_bfloat16 b) { - a = __hadd(a, b); - return a; -} -#endif - -template <typename T, int N> -DINLINE array_t<T, N> &packed_assign_add(array_t<T, N> &a, array_t<T, N> b) { -#pragma unroll - for (int i = 0; i < N; i++) { - assign_add(a.data[i], b.data[i]); - } - return a; -} - -template <typename T, int N> -DINLINE array_t<float, N> upcast(array_t<T, N> val) { - if constexpr (std::is_same<T, float>::value) { - return val; - } else { - array_t<float, N> out; -#pragma unroll - for (int i = 0; i < N; i++) { - out.data[i] = upcast_s(val.data[i]); - } - return out; - } -} - -template <typename O> -DINLINE O downcast(array_t<float, O::size> val) { - if constexpr (std::is_same<typename O::type, float>::value) { - return val; - } else { - O out; -#pragma unroll - for (int i = 0; i < O::size; i++) { - out.data[i] = downcast_s<typename O::type>(val.data[i]); - } - return out; - } -} - -// compute flag at compile time -__host__ __device__ constexpr uint64_t 
compute_flag(int ngpus) { - auto m = std::numeric_limits<uint64_t>::max(); - return m >> ((8 - ngpus) * 8); -} - -template <int ngpus> -DINLINE void start_sync(const RankSignals &sg, volatile Metadata *meta, - int rank) { - constexpr auto FLAG = compute_flag(ngpus); - if (blockIdx.x == 0) { - if (threadIdx.x < ngpus) - // simultaneously write to the corresponding byte to all other ranks. - // Latency = 1 p2p write - sg.signals[threadIdx.x]->start.data[rank] = 255; - else if (threadIdx.x == 32) - // reset - meta->sg.end.flag = 0; - } - if (threadIdx.x == 0) { - while (meta->sg.start.flag != FLAG) - ; - } - __syncthreads(); -} - -template <int ngpus, bool final_sync = false> -DINLINE void end_sync(const RankSignals &sg, volatile Metadata *meta, - int rank) { - constexpr auto FLAG = compute_flag(ngpus); - __syncthreads(); - __shared__ int num; - if (threadIdx.x == 0) num = atomicAdd((int *)&meta->counter, 1); - __syncthreads(); - - // Only the last completing block can perform the end synchronization - // This can ensures when the final busy wait ends, all ranks must have - // finished reading each other's buffer. - if (num == gridDim.x - 1) { - if (threadIdx.x == 32) { - // reset in a different warp - meta->counter = 0; - meta->sg.start.flag = 0; - } else if (threadIdx.x < ngpus) { - // simultaneously write to the corresponding byte to all other ranks. - // Latency = 1 p2p write - sg.signals[threadIdx.x]->end.data[rank] = 255; - } - // if this is the final sync, only one block needs it - // because kernel exit can serve as sync - if constexpr (final_sync) { - if (threadIdx.x == 0) { - while (meta->sg.end.flag != FLAG) - ; - } - } - } - if constexpr (!final_sync) { - if (threadIdx.x == 0) { - while (meta->sg.end.flag != FLAG) - ; - } - __syncthreads(); - } -} - -template <typename P, int ngpus, typename A> -DINLINE P packed_reduce(const P *ptrs[], int idx) { - A tmp = upcast(ptrs[0][idx]); -#pragma unroll - for (int i = 1; i < ngpus; i++) { - packed_assign_add(tmp, upcast(ptrs[i][idx])); - } - return downcast<P>(tmp); -} - -template <typename T, int ngpus> -__global__ void __launch_bounds__(512, 1) - cross_device_reduce_1stage(RankData *_dp, RankSignals sg, - volatile Metadata *meta, T *__restrict__ result, - int rank, int size) { - using P = typename packed_t<T>::P; - using A = typename packed_t<T>::A; - // note: we don't reorder the address so the accumulation order is the same - // for all ranks, ensuring bitwise identical results - auto dp = *_dp; - start_sync<ngpus>(sg, meta, rank); - // do the actual reduction - for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; - idx += gridDim.x * blockDim.x) { - ((P *)result)[idx] = - packed_reduce<P, ngpus, A>((const P **)&dp.ptrs[0], idx); - } - end_sync<ngpus, true>(sg, meta, rank); -} - -template <typename P> -DINLINE P *get_tmp_buf(volatile Signal *sg) { - return (P *)(((Metadata *)sg) + 1); -} - -template <typename T, int ngpus> -__global__ void __launch_bounds__(512, 1) - cross_device_reduce_2stage(RankData *_dp, RankSignals sg, - volatile Metadata *meta, T *__restrict__ result, - int rank, int size) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = gridDim.x * blockDim.x; - using P = typename packed_t<T>::P; - using A = typename packed_t<T>::A; - int part = size / ngpus; - int start = rank * part; - int end = rank == ngpus - 1 ? 
size : start + part; - const P *ptrs[ngpus]; - P *tmps[ngpus]; -#pragma unroll - for (int i = 0; i < ngpus; i++) { - int target = (rank + i) % ngpus; - ptrs[i] = (const P *)_dp->ptrs[target]; - tmps[i] = get_tmp_buf<P>(sg.signals[target]); - } - auto tmp_out = tmps[0]; - start_sync<ngpus>(sg, meta, rank); - // stage 1: reduce scatter - for (int idx = start + tid; idx < end; idx += stride) { - tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx); - } - // Maybe TODO: replace this with per-block release-acquire - // can save about 1-2us (not a lot though) - end_sync<ngpus>(sg, meta, rank); - - // stage 2: allgather - for (int idx = tid; idx < part; idx += stride) { -#pragma unroll - for (int i = 0; i < ngpus; i++) { - int dst_idx = ((rank + i) % ngpus) * part + idx; - ((P *)result)[dst_idx] = tmps[i][idx]; - } - } - // process the last larger partition - int remaining = size - part * ngpus; - if (tid < remaining) { - int dst_idx = tid + part * ngpus; - ((P *)result)[dst_idx] = get_tmp_buf<P>(sg.signals[ngpus - 1])[part + tid]; - } - - // faster than this - // for (int idx = tid; idx < size; idx += stride) { - // int target_rank = idx / part; - // if (target_rank == ngpus) target_rank -= 1; - // ((P *)result)[idx] = tmps[target_rank][idx - target_rank * part]; - // } -} - -template <typename T, int ngpus> -__global__ void __launch_bounds__(512, 1) - cross_device_reduce_half_butterfly(RankData *_dp, RankSignals sg, - volatile Metadata *meta, - T *__restrict__ result, int rank, - int size) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = gridDim.x * blockDim.x; - using P = typename packed_t<T>::P; - using A = typename packed_t<T>::A; - auto tmp_out = get_tmp_buf<P>(sg.signals[rank]); - constexpr int hg = ngpus / 2; - // Actually not quite half butterfly. - // This is an all-to-all within each group containing half of the ranks - // followed by cross-group add. Equivalent to half butterfly when there - // are 4 GPUs, a common case for PCIe cards like T4 and A10. - const P *ptrs[hg]; - { - int start = rank - rank % hg; -#pragma unroll - for (int i = 0; i < hg; i++) { - ptrs[i] = (const P *)_dp->ptrs[i + start]; - } - } - start_sync<ngpus>(sg, meta, rank); - for (int idx = tid; idx < size; idx += stride) { - tmp_out[idx] = packed_reduce<P, hg, A>(ptrs, idx); - } - end_sync<ngpus>(sg, meta, rank); - - auto src = get_tmp_buf<P>(sg.signals[(ngpus - 1) - rank % ngpus]); - // do the cross group reduction - for (int idx = tid; idx < size; idx += stride) { - auto tmp = tmp_out[idx]; - packed_assign_add(tmp, src[idx]); - ((P *)result)[idx] = tmp; - } -} - -using IPC_KEY = std::array<uint8_t, sizeof(cudaIpcMemHandle_t)>; -static_assert(sizeof(IPC_KEY) == sizeof(cudaIpcMemHandle_t)); -static_assert(alignof(IPC_KEY) == alignof(cudaIpcMemHandle_t)); - -class CustomAllreduce { - public: - int rank_; - int world_size_; - bool full_nvlink_; - - // below are device pointers - RankSignals sg_; - std::unordered_map<void *, RankData *> buffers_; - Metadata *meta_; - - // stores the registered device pointers from all ranks - RankData *d_rank_data_base_, *d_rank_data_end_; - std::vector<void *> graph_unreg_buffers_; - // a map from IPC handles to opened IPC pointers - std::map<IPC_KEY, char *> ipc_handles_; - - /** - * meta is a pointer to device metadata and temporary buffer for allreduce. - * - * There's a total of sizeof(Metadata) of prefix before the actual data, - * so meta + 1 points to actual temporary buffer. - * - * note: this class does not own any device memory. 
Any required buffers - * are passed in from the constructor - */ - CustomAllreduce(Metadata *meta, void *rank_data, size_t rank_data_sz, - const cudaIpcMemHandle_t *handles, - const std::vector<int64_t> &offsets, int rank, - bool full_nvlink = true) - : rank_(rank), - world_size_(offsets.size()), - full_nvlink_(full_nvlink), - meta_(meta), - d_rank_data_base_(reinterpret_cast<RankData *>(rank_data)), - d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { - for (int i = 0; i < world_size_; i++) { - Metadata *rank_meta; - if (i != rank_) { - char *handle = open_ipc_handle(&handles[i]); - handle += offsets[i]; - rank_meta = (Metadata *)handle; - } else { - rank_meta = meta_; - } - sg_.signals[i] = &rank_meta->sg; - } - } - - char *open_ipc_handle(const void *ipc_handle) { - auto [it, new_handle] = - ipc_handles_.insert({*((IPC_KEY *)ipc_handle), nullptr}); - if (new_handle) { - char *ipc_ptr; - CUDACHECK(cudaIpcOpenMemHandle((void **)&ipc_ptr, - *((const cudaIpcMemHandle_t *)ipc_handle), - cudaIpcMemLazyEnablePeerAccess)); - it->second = ipc_ptr; - } - return it->second; - } - - std::pair<std::vector<uint8_t>, std::vector<int64_t>> - get_graph_buffer_ipc_meta() { - auto num_buffers = graph_unreg_buffers_.size(); - auto handle_sz = sizeof(cudaIpcMemHandle_t); - std::vector<uint8_t> handles(handle_sz * num_buffers, 0); - std::vector<int64_t> offsets(num_buffers); - for (int i = 0; i < num_buffers; i++) { - auto ptr = graph_unreg_buffers_[i]; - void *base_ptr; - // note: must share the base address of each allocation, or we get wrong - // address - if (cuPointerGetAttribute(&base_ptr, - CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, - (CUdeviceptr)ptr) != CUDA_SUCCESS) - throw std::runtime_error("failed to get pointer attr"); - CUDACHECK(cudaIpcGetMemHandle( - (cudaIpcMemHandle_t *)&handles[i * handle_sz], base_ptr)); - offsets[i] = ((char *)ptr) - ((char *)base_ptr); - } - return std::make_pair(handles, offsets); - } - - void check_rank_data_capacity(size_t num = 1) { - if (d_rank_data_base_ + num > d_rank_data_end_) - throw std::runtime_error( - "Rank data buffer is overflowed by " + - std::to_string(d_rank_data_base_ + num - d_rank_data_end_)); - } - - void register_buffer(const std::vector<std::string> &handles, - const std::vector<int64_t> &offsets, void *self) { - check_rank_data_capacity(); - RankData data; - for (int i = 0; i < world_size_; i++) { - if (i != rank_) { - char *handle = open_ipc_handle(handles[i].data()); - handle += offsets[i]; - data.ptrs[i] = handle; - } else { - data.ptrs[i] = self; - } - } - auto d_data = d_rank_data_base_++; - CUDACHECK( - cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice)); - buffers_[self] = d_data; - } - - // note: when registering graph buffers, we intentionally choose to not - // deduplicate the addresses. That means if the allocator reuses some - // addresses, they will be registered again. This is to account for the remote - // possibility of different allocation patterns between ranks. For example, - // rank 1 may get the same input address for the second allreduce, but rank 2 - // got a different address. IPC handles have internal reference counting - // mechanism so overhead should be small. 
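Both `register_buffer` above and `register_graph_buffers` below reduce to the same CUDA IPC round-trip: the owning rank exports a `cudaIpcMemHandle_t` for its allocation, each peer opens that handle to obtain a device pointer valid in its own process, and the resulting per-rank pointer table is copied into `d_rank_data_base_`. A minimal, hypothetical standalone sketch of that round-trip (not the deleted class; error handling and the peer process are elided):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    void* buf = nullptr;
    if (cudaMalloc(&buf, 1 << 20) != cudaSuccess) return 1;

    // Owning rank: export a handle that can be shipped to peers (e.g. via MPI).
    cudaIpcMemHandle_t handle;
    cudaIpcGetMemHandle(&handle, buf);

    // A peer rank, in a *different* process, would open it like this:
    //   void* peer_ptr = nullptr;
    //   cudaIpcOpenMemHandle(&peer_ptr, handle, cudaIpcMemLazyEnablePeerAccess);
    //   ... read/write peer_ptr ...
    //   cudaIpcCloseMemHandle(peer_ptr);

    std::printf("exported a %zu-byte IPC handle\n", sizeof(handle));
    cudaFree(buf);
    return 0;
}
```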
- void register_graph_buffers( - const std::vector<std::string> &handles, - const std::vector<std::vector<int64_t>> &offsets) { - auto num_buffers = graph_unreg_buffers_.size(); - check_rank_data_capacity(num_buffers); - std::vector<RankData> rank_data(num_buffers); - for (int i = 0; i < num_buffers; i++) { - auto self_ptr = graph_unreg_buffers_[i]; - auto &rd = rank_data[i]; - for (int j = 0; j < world_size_; j++) { - if (j != rank_) { - char *handle = - open_ipc_handle(&handles[j][i * sizeof(cudaIpcMemHandle_t)]); - handle += offsets[j][i]; - rd.ptrs[j] = handle; - } else { - rd.ptrs[j] = self_ptr; - } - } - } - CUDACHECK(cudaMemcpy(d_rank_data_base_, rank_data.data(), - sizeof(RankData) * num_buffers, - cudaMemcpyHostToDevice)); - d_rank_data_base_ += num_buffers; - graph_unreg_buffers_.clear(); - } - - /** - * This is the result after careful grid search. Using 36 blocks give the best - * or close to the best runtime on the devices I tried: A100, A10, A30, T4, - * V100. You'll notice that NCCL kernels also only take a small amount of SMs. - * Not quite sure the underlying reason, but my guess is that too many SMs - * will cause contention on NVLink bus. - */ - template <typename T> - void allreduce(cudaStream_t stream, T *input, T *output, int size, - int threads = 512, int block_limit = 36) { - auto d = packed_t<T>::P::size; - if (size % d != 0) - throw std::runtime_error( - "custom allreduce currently requires input length to be multiple " - "of " + - std::to_string(d)); - - RankData *ptrs; - cudaStreamCaptureStatus status; - CUDACHECK(cudaStreamIsCapturing(stream, &status)); - if (status == cudaStreamCaptureStatusActive) { - ptrs = d_rank_data_base_ + graph_unreg_buffers_.size(); - graph_unreg_buffers_.push_back(input); - } else { - auto it = buffers_.find(input); - if (it == buffers_.end()) - throw std::runtime_error( - "buffer address " + - std::to_string(reinterpret_cast<uint64_t>(input)) + - " is not registered!"); - ptrs = it->second; - } - - size /= d; - auto bytes = size * sizeof(typename packed_t<T>::P); - int blocks = std::min(block_limit, (size + threads - 1) / threads); -#define KL(ngpus, name) \ - name<T, ngpus> \ - <<<blocks, threads, 0, stream>>>(ptrs, sg_, meta_, output, rank_, size); -#define REDUCE_CASE(ngpus) \ - case ngpus: { \ - if (world_size_ == 2) { \ - KL(ngpus, cross_device_reduce_1stage); \ - } else if (full_nvlink_) { \ - if ((world_size_ <= 4 && bytes < 512 * 1024) || \ - (world_size_ <= 8 && bytes < 256 * 1024)) { \ - KL(ngpus, cross_device_reduce_1stage); \ - } else { \ - KL(ngpus, cross_device_reduce_2stage); \ - } \ - } else { \ - KL(ngpus, cross_device_reduce_half_butterfly); \ - } \ - break; \ - } - - switch (world_size_) { - REDUCE_CASE(2) - REDUCE_CASE(4) - REDUCE_CASE(6) - REDUCE_CASE(8) - default: - throw std::runtime_error( - "custom allreduce only supports num gpus in (2,4,6,8). 
Actual num " - "gpus = " + - std::to_string(world_size_)); - } -#undef REDUCE_CASE -#undef KL - } - - ~CustomAllreduce() { - for (auto [_, ptr] : ipc_handles_) { - CUDACHECK(cudaIpcCloseMemHandle(ptr)); - } - } -}; -/** - * To inspect PTX/SASS, copy paste this header file to compiler explorer and add - a template instantiation: - * template void CustomAllreduce::allreduce<half>(cudaStream_t, half *, half *, - int, int, int); -*/ -} // namespace vllm diff --git a/csrc/custom_all_reduce_test.cu b/csrc/custom_all_reduce_test.cu deleted file mode 100644 index 6b094e2fdc9ba13ab9111fe68012f59491237c0b..0000000000000000000000000000000000000000 --- a/csrc/custom_all_reduce_test.cu +++ /dev/null @@ -1,284 +0,0 @@ -/** - * This is a standalone test for custom allreduce. - * To compile, make sure you have MPI and NCCL installed in your system. - * export MPI_HOME=XXX - * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o - * custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi - * - * Warning: this C++ test is not designed to be very readable and was used - * during the rapid prototyping process. - * - * To run: - * mpirun -np 8 ./custom_all_reduce_test - */ -#include <cuda.h> -#include <curand_kernel.h> -#include <stdio.h> -#include <stdlib.h> - -#include <limits> -#include <vector> - -#include "cuda_profiler_api.h" -#include "custom_all_reduce.cuh" -#include "mpi.h" -#include "nccl.h" - -#define MPICHECK(cmd) \ - do { \ - int e = cmd; \ - if (e != MPI_SUCCESS) { \ - printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -#define NCCLCHECK(cmd) \ - do { \ - ncclResult_t r = cmd; \ - if (r != ncclSuccess) { \ - printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ - ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -__global__ void dummy_kernel() { - for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms -} - -template <typename T> -__global__ void set_data(T *data, int size, int myRank) { - for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; - idx += gridDim.x * blockDim.x) { - data[idx] = myRank * 0.11f; - } -} - -template <typename T> -__global__ void convert_data(const T *data1, const T *data2, double *fdata1, - double *fdata2, int size) { - for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; - idx += gridDim.x * blockDim.x) { - fdata1[idx] = data1[idx]; - fdata2[idx] = data2[idx]; - } -} - -__global__ void init_rand(curandState_t *state, int size, int nRanks) { - for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; - idx += gridDim.x * blockDim.x) { - for (int i = 0; i < nRanks; i++) { - curand_init(i + 1, idx, 0, &state[idx * nRanks + i]); - } - } -} - -template <typename T> -__global__ void gen_data(curandState_t *state, T *data, double *ground_truth, - int myRank, int nRanks, int size) { - for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; - idx += gridDim.x * blockDim.x) { - double sum = 0.0; - for (int i = 0; i < nRanks; i++) { - double val = curand_uniform_double(&state[idx * nRanks + i]) * 4; - T hval = val; // downcast first - sum += static_cast<double>(hval); - if (i == myRank) data[idx] = hval; - } - ground_truth[idx] = sum; - } -} - -template <typename T> -void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit, - int data_size) { - T *result; - cudaStream_t stream; - CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - CUDACHECK(cudaMalloc(&result, data_size * sizeof(T))); - 
CUDACHECK(cudaMemset(result, 0, data_size * sizeof(T))); - - cudaIpcMemHandle_t self_data_handle; - cudaIpcMemHandle_t data_handles[8]; - vllm::Metadata *buffer; - T *self_data_copy; - /** - * Allocate IPC buffer - * - * The first section is a temporary buffer for storing intermediate allreduce - * results, if a particular algorithm requires it. The second section is for - * the input to the allreduce. The actual API takes the input pointer as an - * argument (that is, they can and usually should be allocated separately). - * But since the input pointers and the temporary buffer all require IPC - * registration, they are allocated and registered together in the test for - * convenience. - */ - CUDACHECK( - cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Metadata))); - CUDACHECK(cudaMemset(buffer, 0, - 2 * data_size * sizeof(T) + sizeof(vllm::Metadata))); - CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T))); - CUDACHECK(cudaIpcGetMemHandle(&self_data_handle, buffer)); - - MPICHECK(MPI_Allgather(&self_data_handle, sizeof(cudaIpcMemHandle_t), - MPI_BYTE, data_handles, sizeof(cudaIpcMemHandle_t), - MPI_BYTE, MPI_COMM_WORLD)); - - void *rank_data; - size_t rank_data_sz = 16 * 1024 * 1024; - CUDACHECK(cudaMalloc(&rank_data, rank_data_sz)); - std::vector<int64_t> offsets(nRanks, 0); - vllm::CustomAllreduce fa(buffer, rank_data, rank_data_sz, data_handles, - offsets, myRank); - auto *self_data = - reinterpret_cast<T *>(reinterpret_cast<char *>(buffer) + - sizeof(vllm::Metadata) + data_size * sizeof(T)); - // hack buffer registration - { - std::vector<std::string> handles; - handles.reserve(nRanks); - for (int i = 0; i < nRanks; i++) { - char *begin = (char *)&data_handles[i]; - char *end = (char *)&data_handles[i + 1]; - handles.emplace_back(begin, end); - } - std::vector<int64_t> offsets( - nRanks, sizeof(vllm::Metadata) + data_size * sizeof(T)); - fa.register_buffer(handles, offsets, self_data); - } - - double *ground_truth; - CUDACHECK(cudaMallocHost(&ground_truth, data_size * sizeof(double))); - curandState_t *states; - CUDACHECK(cudaMalloc(&states, sizeof(curandState_t) * nRanks * data_size)); - init_rand<<<108, 1024, 0, stream>>>(states, data_size, nRanks); - gen_data<T><<<108, 1024, 0, stream>>>(states, self_data, ground_truth, myRank, - nRanks, data_size); - CUDACHECK(cudaMemcpyAsync(self_data_copy, self_data, data_size * sizeof(T), - cudaMemcpyDeviceToDevice, stream)); - cudaEvent_t start, stop; - CUDACHECK(cudaEventCreate(&start)); - CUDACHECK(cudaEventCreate(&stop)); - - ncclDataType_t ncclDtype; - if (std::is_same<T, half>::value) { - ncclDtype = ncclFloat16; - } else if (std::is_same<T, nv_bfloat16>::value) { - ncclDtype = ncclBfloat16; - } else { - ncclDtype = ncclFloat; - } - - dummy_kernel<<<1, 1, 0, stream>>>(); - constexpr int warmup_iters = 5; - constexpr int num_iters = 25; - // warmup - for (int i = 0; i < warmup_iters; i++) { - NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum, comm, - stream)); - } - CUDACHECK(cudaEventRecord(start, stream)); - for (int i = 0; i < num_iters; i++) { - NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum, comm, - stream)); - } - CUDACHECK(cudaEventRecord(stop, stream)); - CUDACHECK(cudaStreamSynchronize(stream)); - float allreduce_ms = 0; - cudaEventElapsedTime(&allreduce_ms, start, stop); - - // if (myRank == 1) dummy_kernel<<<1, 1, 0, stream>>>(); - // set_data<T><<<16, 1024, 0, stream>>>(self_data, data_size, myRank); - - dummy_kernel<<<1, 1, 0, stream>>>(); - // warm up - for 
(int i = 0; i < warmup_iters; i++) { - fa.allreduce<T>(stream, self_data, result, data_size, threads, block_limit); - } - CUDACHECK(cudaEventRecord(start, stream)); - for (int i = 0; i < num_iters; i++) { - fa.allreduce<T>(stream, self_data, result, data_size, threads, block_limit); - } - CUDACHECK(cudaEventRecord(stop, stream)); - CUDACHECK(cudaStreamSynchronize(stream)); - - float duration_ms = 0; - cudaEventElapsedTime(&duration_ms, start, stop); - if (myRank == 0) - printf( - "Rank %d done, nGPUs:%d, sz (kb): %d, %d, %d, my time:%.2fus, nccl " - "time:%.2fus\n", - myRank, nRanks, data_size * sizeof(T) / 1024, threads, block_limit, - duration_ms * 1e3 / num_iters, allreduce_ms * 1e3 / num_iters); - - // And wait for all the queued up work to complete - CUDACHECK(cudaStreamSynchronize(stream)); - - NCCLCHECK(ncclAllReduce(self_data_copy, self_data, data_size, ncclDtype, - ncclSum, comm, stream)); - - double *nccl_result, *my_result; - CUDACHECK(cudaMallocHost(&nccl_result, data_size * sizeof(double))); - CUDACHECK(cudaMallocHost(&my_result, data_size * sizeof(double))); - - convert_data<T><<<108, 1024, 0, stream>>>(self_data, result, nccl_result, - my_result, data_size); - CUDACHECK(cudaStreamSynchronize(stream)); - - for (unsigned long j = 0; j < data_size; j++) { - auto diff = abs(nccl_result[j] - my_result[j]); - if (diff >= 1e-2) { - printf("Rank %d: Verification mismatch at %lld: %f != (my) %f, gt=%f\n", - myRank, j, nccl_result[j], my_result[j], ground_truth[j]); - break; - } - } - - long double nccl_diffs = 0.0; - long double my_diffs = 0.0; - for (int j = 0; j < data_size; j++) { - nccl_diffs += abs(nccl_result[j] - ground_truth[j]); - my_diffs += abs(my_result[j] - ground_truth[j]); - } - if (myRank == 0) - std::cout << "average abs diffs: nccl: " << nccl_diffs / data_size - << " me: " << my_diffs / data_size << std::endl; - - CUDACHECK(cudaFree(result)); - CUDACHECK(cudaFree(self_data_copy)); - CUDACHECK(cudaFree(rank_data)); - CUDACHECK(cudaFree(buffer)); - CUDACHECK(cudaFree(states)); - CUDACHECK(cudaFreeHost(ground_truth)); - CUDACHECK(cudaFreeHost(nccl_result)); - CUDACHECK(cudaFreeHost(my_result)); - CUDACHECK(cudaStreamDestroy(stream)); -} - -int main(int argc, char **argv) { - int nRanks, myRank; - MPICHECK(MPI_Init(&argc, &argv)); - MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank)); - MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks)); - CUDACHECK(cudaSetDevice(myRank)); - ncclUniqueId id; - ncclComm_t comm; - if (myRank == 0) ncclGetUniqueId(&id); - MPICHECK(MPI_Bcast(static_cast<void *>(&id), sizeof(id), MPI_BYTE, 0, - MPI_COMM_WORLD)); - NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank)); - - cudaProfilerStart(); - // for (int threads : {256, 512}) { - // for (int block_limit = 16; block_limit < 112; block_limit += 4) { - // run<half>(myRank, nRanks, comm, threads, block_limit, 4096 * 1024); - // } - // } - for (int sz = 512; sz <= (32 << 20); sz *= 2) { - run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 50); - } - - cudaProfilerStop(); - return EXIT_SUCCESS; -} diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h deleted file mode 100644 index 91abd9e85b4bb07a548ec3e3b8c4e390a7d79426..0000000000000000000000000000000000000000 --- a/csrc/dispatch_utils.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Adapted from - * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h - */ -#pragma once - -#include <torch/extension.h> - -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) 
\ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) - -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH( \ - TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) - -#define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) - -#define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH( \ - TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__)) - -#define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) - -#define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH( \ - TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu deleted file mode 100644 index 6d34d014c858e318f11700582071c8b297324b92..0000000000000000000000000000000000000000 --- a/csrc/layernorm_kernels.cu +++ /dev/null @@ -1,120 +0,0 @@ -#include <torch/extension.h> -#include <ATen/cuda/CUDAContext.h> -#include <c10/cuda/CUDAGuard.h> - -#include "dispatch_utils.h" -#include "reduction_utils.cuh" - -namespace vllm { - -// TODO(woosuk): Further optimize this kernel. -template<typename scalar_t> -__global__ void rms_norm_kernel( - scalar_t* __restrict__ out, // [..., hidden_size] - const scalar_t* __restrict__ input, // [..., hidden_size] - const scalar_t* __restrict__ weight, // [hidden_size] - const float epsilon, - const int num_tokens, - const int hidden_size) { - __shared__ float s_variance; - float variance = 0.0f; - - for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - const float x = (float) input[blockIdx.x * hidden_size + idx]; - variance += x * x; - } - variance = blockReduceSum<float>(variance); - if (threadIdx.x == 0) { - s_variance = rsqrtf(variance / hidden_size + epsilon); - } - __syncthreads(); - - for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - float x = (float) input[blockIdx.x * hidden_size + idx]; - out[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx]; - } -} - -// TODO: Further optimize this kernel. 
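// (Editorial sketch; not part of the original file.) Per token, rms_norm_kernel
// above computes out[i] = input[i] * rsqrt(mean(input^2) + epsilon) * weight[i];
// the fused variant below first adds the residual (writing the sum back into
// residual) and then normalizes that sum into input. A minimal single-threaded
// host reference, with an illustrative name and a float-only signature,
// assuming <cmath> is reachable through the includes above:
static void rms_norm_reference(float* out, const float* input,
                               const float* weight, float epsilon,
                               int num_tokens, int hidden_size) {
  for (int t = 0; t < num_tokens; ++t) {
    const float* x = input + static_cast<int64_t>(t) * hidden_size;
    float sum_sq = 0.0f;
    for (int i = 0; i < hidden_size; ++i) {
      sum_sq += x[i] * x[i];
    }
    // Mirrors rsqrtf(variance / hidden_size + epsilon) in the kernel.
    const float inv_rms = 1.0f / std::sqrt(sum_sq / hidden_size + epsilon);
    for (int i = 0; i < hidden_size; ++i) {
      out[static_cast<int64_t>(t) * hidden_size + i] = (x[i] * inv_rms) * weight[i];
    }
  }
}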
-template<typename scalar_t> -__global__ void fused_add_rms_norm_kernel( - scalar_t* __restrict__ input, // [..., hidden_size] - scalar_t* __restrict__ residual, // [..., hidden_size] - const scalar_t* __restrict__ weight, // [hidden_size] - const float epsilon, - const int num_tokens, - const int hidden_size) { - __shared__ float s_variance; - float variance = 0.0f; - - for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - float x = (float) input[blockIdx.x * hidden_size + idx]; - x += (float) residual[blockIdx.x * hidden_size + idx]; - variance += x * x; - residual[blockIdx.x * hidden_size + idx] = (scalar_t) x; - } - variance = blockReduceSum<float>(variance); - if (threadIdx.x == 0) { - s_variance = rsqrtf(variance / hidden_size + epsilon); - } - __syncthreads(); - - for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { - float x = (float) residual[blockIdx.x * hidden_size + idx]; - input[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx]; - } -} - -} // namespace vllm - -void rms_norm( - torch::Tensor& out, // [..., hidden_size] - torch::Tensor& input, // [..., hidden_size] - torch::Tensor& weight, // [hidden_size] - float epsilon) { - int hidden_size = input.size(-1); - int num_tokens = input.numel() / hidden_size; - - dim3 grid(num_tokens); - dim3 block(std::min(hidden_size, 1024)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), - "rms_norm_kernel", - [&] { - vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>( - out.data_ptr<scalar_t>(), - input.data_ptr<scalar_t>(), - weight.data_ptr<scalar_t>(), - epsilon, - num_tokens, - hidden_size); - }); -} - -void fused_add_rms_norm( - torch::Tensor& input, // [..., hidden_size] - torch::Tensor& residual, // [..., hidden_size] - torch::Tensor& weight, // [hidden_size] - float epsilon) { - int hidden_size = input.size(-1); - int num_tokens = input.numel() / hidden_size; - - dim3 grid(num_tokens); - dim3 block(std::min(hidden_size, 1024)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), - "fused_add_rms_norm_kernel", - [&] { - vllm::fused_add_rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>( - input.data_ptr<scalar_t>(), - residual.data_ptr<scalar_t>(), - weight.data_ptr<scalar_t>(), - epsilon, - num_tokens, - hidden_size); - }); -} diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe_align_block_size_kernels.cu deleted file mode 100644 index de6a0ec0a972c39ddbec276c76767eeff2085819..0000000000000000000000000000000000000000 --- a/csrc/moe_align_block_size_kernels.cu +++ /dev/null @@ -1,108 +0,0 @@ -#include <torch/extension.h> -#include <ATen/cuda/CUDAContext.h> - -#include <ATen/ATen.h> -#include <THC/THCAtomics.cuh> - -#include "cuda_compat.h" -#include "dispatch_utils.h" - -const static size_t NUM_MAX_EXPERTS = 64; -#define CEILDIV(x,y) (((x) + (y) - 1) / (y)) - -namespace vllm { -template <typename scalar_t> -__global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, - int32_t *sorted_token_ids, - int32_t *expert_ids, - int32_t *total_tokens_post_pad, - int32_t num_experts, - int32_t block_size, - size_t numel) { - const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); - const size_t start_idx = threadIdx.x * tokens_per_thread; - __shared__ int32_t 
tokens_cnts[NUM_MAX_EXPERTS + 1][NUM_MAX_EXPERTS]; - __shared__ int32_t cumsum[NUM_MAX_EXPERTS + 1]; - for (int i = 0; i < num_experts; ++i) { - tokens_cnts[threadIdx.x + 1][i] = 0; - } - - /** - * In the first step we compute token_cnts[thread_index + 1][expert_index], - * which counts how many tokens in the token shard of thread_index are assigned - * to expert expert_index. - */ - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - ++tokens_cnts[threadIdx.x + 1][topk_ids[i]]; - } - - __syncthreads(); - - // For each expert we accumulate the token counts from the different threads. - tokens_cnts[0][threadIdx.x] = 0; - for (int i = 1; i <= blockDim.x; ++i) { - tokens_cnts[i][threadIdx.x] += tokens_cnts[i-1][threadIdx.x]; - } - - __syncthreads(); - - // We accumulate the token counts of all experts in thread 0. - if (threadIdx.x == 0) { - cumsum[0] = 0; - for (int i = 1; i <= num_experts; ++i) { - cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[blockDim.x][i - 1], block_size) * block_size; - } - *total_tokens_post_pad = cumsum[num_experts]; - } - - __syncthreads(); - - /** - * For each expert, each thread processes the tokens of the corresponding blocks - * and stores the corresponding expert_id for each block. - */ - for (int i = cumsum[threadIdx.x];i < cumsum[threadIdx.x + 1];i += block_size) { - expert_ids[i / block_size] = threadIdx.x; - } - - /** - * Each thread processes a token shard, calculating the index of each token after - * sorting by expert number. Given the example topk_ids = [0,1,2,1,2,3,0,3,4] and - * block_size = 4, then the output would be [0, 6, *, *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], - * where * represents a padding value(preset in python). - */ - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - int32_t expert_id = topk_ids[i]; - /** The cumsum[expert_id] stores the starting index of the tokens that the - * expert with expert_id needs to process, and tokens_cnts[threadIdx.x][expert_id] - * stores the indices of the tokens processed by the expert with expert_id within - * the current thread's token shard. 
- */ - int32_t rank_post_pad = tokens_cnts[threadIdx.x][expert_id] + cumsum[expert_id]; - sorted_token_ids[rank_post_pad] = i; - ++tokens_cnts[threadIdx.x][expert_id]; - } -} -} - -void moe_align_block_size( - torch::Tensor topk_ids, - int num_experts, - int block_size, - torch::Tensor sorted_token_ids, - torch::Tensor experts_ids, - torch::Tensor num_tokens_post_pad) { - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - assert(num_experts <= NUM_MAX_EXPERTS); - VLLM_DISPATCH_INTEGRAL_TYPES( - topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { - vllm::moe_align_block_size_kernel<scalar_t><<<1, num_experts, 0, stream>>>( - topk_ids.data_ptr<scalar_t>(), - sorted_token_ids.data_ptr<int32_t>(), - experts_ids.data_ptr<int32_t>(), - num_tokens_post_pad.data_ptr<int32_t>(), - num_experts, - block_size, - topk_ids.numel()); - }); -} diff --git a/csrc/ops.h b/csrc/ops.h deleted file mode 100644 index 2bcd0c2efc5c6b029c7605f456898b2902f4d100..0000000000000000000000000000000000000000 --- a/csrc/ops.h +++ /dev/null @@ -1,130 +0,0 @@ -#pragma once - -#include <torch/extension.h> - -void paged_attention_v1( - torch::Tensor& out, - torch::Tensor& query, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - int num_kv_heads, - float scale, - torch::Tensor& block_tables, - torch::Tensor& context_lens, - int block_size, - int max_context_len, - const c10::optional<torch::Tensor>& alibi_slopes, - const std::string& kv_cache_dtype); - -void paged_attention_v2( - torch::Tensor& out, - torch::Tensor& exp_sums, - torch::Tensor& max_logits, - torch::Tensor& tmp_out, - torch::Tensor& query, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - int num_kv_heads, - float scale, - torch::Tensor& block_tables, - torch::Tensor& context_lens, - int block_size, - int max_context_len, - const c10::optional<torch::Tensor>& alibi_slopes, - const std::string& kv_cache_dtype); - -void rms_norm( - torch::Tensor& out, - torch::Tensor& input, - torch::Tensor& weight, - float epsilon); - -void fused_add_rms_norm( - torch::Tensor& input, - torch::Tensor& residual, - torch::Tensor& weight, - float epsilon); - -void rotary_embedding( - torch::Tensor& positions, - torch::Tensor& query, - torch::Tensor& key, - int head_size, - torch::Tensor& cos_sin_cache, - bool is_neox); - -void silu_and_mul( - torch::Tensor& out, - torch::Tensor& input); - -void gelu_new( - torch::Tensor& out, - torch::Tensor& input); - -void gelu_fast( - torch::Tensor& out, - torch::Tensor& input); - -#ifndef USE_ROCM -torch::Tensor awq_gemm( - torch::Tensor _in_feats, - torch::Tensor _kernel, - torch::Tensor _scaling_factors, - torch::Tensor _zeros, - int split_k_iters); - -torch::Tensor awq_dequantize( - torch::Tensor _kernel, - torch::Tensor _scaling_factors, - torch::Tensor _zeros, - int split_k_iters, - int thx, - int thy); -#endif - -void squeezellm_gemm( - torch::Tensor vec, - torch::Tensor mat, - torch::Tensor mul, - torch::Tensor lookup_table); - -torch::Tensor gptq_gemm( - torch::Tensor a, - torch::Tensor b_q_weight, - torch::Tensor b_gptq_qzeros, - torch::Tensor b_gptq_scales, - torch::Tensor b_g_idx, - bool use_exllama); - -void gptq_shuffle( - torch::Tensor q_weight, - torch::Tensor q_perm); - -void moe_align_block_size( - torch::Tensor topk_ids, - int num_experts, - int block_size, - torch::Tensor sorted_token_ids, - torch::Tensor experts_ids, - torch::Tensor num_tokens_post_pad); - -#ifndef USE_ROCM -using fptr_t = uint64_t; -fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, - const 
std::vector<std::string> &handles, - const std::vector<int64_t> &offsets, int rank, - bool full_nvlink); -bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size, - bool full_nvlink); -void all_reduce_reg(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out); -void all_reduce_unreg(fptr_t _fa, torch::Tensor &inp, torch::Tensor ®_buffer, - torch::Tensor &out); -void dispose(fptr_t _fa); -int meta_size(); -void register_buffer(fptr_t _fa, torch::Tensor &t, - const std::vector<std::string> &handles, - const std::vector<int64_t> &offsets); -std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(fptr_t _fa); -void register_graph_buffers(fptr_t _fa, const std::vector<std::string> &handles, - const std::vector<std::vector<int64_t>> &offsets); -#endif diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu deleted file mode 100644 index 5f522795619e13a7514c55152d96bc379276d3af..0000000000000000000000000000000000000000 --- a/csrc/pos_encoding_kernels.cu +++ /dev/null @@ -1,130 +0,0 @@ -#include <torch/extension.h> -#include <ATen/cuda/CUDAContext.h> -#include <c10/cuda/CUDAGuard.h> - -#include "cuda_compat.h" -#include "dispatch_utils.h" - -namespace vllm { - -template<typename scalar_t, bool IS_NEOX> -inline __device__ void apply_rotary_embedding( - scalar_t* __restrict__ arr, - const scalar_t* __restrict__ cos_ptr, - const scalar_t* __restrict__ sin_ptr, - int rot_offset, - int embed_dim) -{ - int x_index, y_index; - scalar_t cos, sin; - if (IS_NEOX) { - // GPT-NeoX style rotary embedding. - x_index = rot_offset; - y_index = embed_dim + rot_offset; - cos = VLLM_LDG(cos_ptr + x_index); - sin = VLLM_LDG(sin_ptr + x_index); - } else { - // GPT-J style rotary embedding. - x_index = 2 * rot_offset; - y_index = 2 * rot_offset + 1; - cos = VLLM_LDG(cos_ptr + x_index / 2); - sin = VLLM_LDG(sin_ptr + x_index / 2); - } - - const scalar_t x = arr[x_index]; - const scalar_t y = arr[y_index]; - arr[x_index] = x * cos - y * sin; - arr[y_index] = y * cos + x * sin; -} - -template<typename scalar_t, bool IS_NEOX> -__global__ void rotary_embedding_kernel( - const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] - scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] - scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] - const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] - const int rot_dim, - const int64_t query_stride, - const int64_t key_stride, - const int num_heads, - const int num_kv_heads, - const int head_size) { - // Each thread block is responsible for one token. 
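  // (Descriptive note added for clarity.) cos_sin_cache stores, for every
  // position, embed_dim = rot_dim / 2 cosines followed by embed_dim sines.
  // Each (cos, sin) pair rotates one 2-D slice of a head via
  // x' = x*cos - y*sin, y' = y*cos + x*sin: indices (i, i + embed_dim) in
  // GPT-NeoX style, or the adjacent pair (2i, 2i + 1) in GPT-J style, as
  // implemented by apply_rotary_embedding above.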
- const int token_idx = blockIdx.x; - int64_t pos = positions[token_idx]; - const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; - - const int embed_dim = rot_dim / 2; - const scalar_t* cos_ptr = cache_ptr; - const scalar_t* sin_ptr = cache_ptr + embed_dim; - - const int nq = num_heads * embed_dim; - for (int i = threadIdx.x; i < nq; i += blockDim.x) { - const int head_idx = i / embed_dim; - const int64_t token_head = token_idx * query_stride + head_idx * head_size; - const int rot_offset = i % embed_dim; - apply_rotary_embedding<scalar_t, IS_NEOX>(query + token_head, cos_ptr, - sin_ptr, rot_offset, embed_dim); - } - - const int nk = num_kv_heads * embed_dim; - for (int i = threadIdx.x; i < nk; i += blockDim.x) { - const int head_idx = i / embed_dim; - const int64_t token_head = token_idx * key_stride + head_idx * head_size; - const int rot_offset = i % embed_dim; - apply_rotary_embedding<scalar_t, IS_NEOX>(key + token_head, cos_ptr, - sin_ptr, rot_offset, embed_dim); - } -} - -} // namespace vllm - -void rotary_embedding( - torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] - torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size] - torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size] - int head_size, - torch::Tensor& cos_sin_cache, // [max_position, rot_dim] - bool is_neox) { - int64_t num_tokens = query.numel() / query.size(-1); - int rot_dim = cos_sin_cache.size(1); - int num_heads = query.size(-1) / head_size; - int num_kv_heads = key.size(-1) / head_size; - int64_t query_stride = query.stride(-2); - int64_t key_stride = key.stride(-2); - - dim3 grid(num_tokens); - dim3 block(std::min(num_heads * rot_dim / 2, 512)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - query.scalar_type(), - "rotary_embedding", - [&] { - if (is_neox) { - vllm::rotary_embedding_kernel<scalar_t, true><<<grid, block, 0, stream>>>( - positions.data_ptr<int64_t>(), - query.data_ptr<scalar_t>(), - key.data_ptr<scalar_t>(), - cos_sin_cache.data_ptr<scalar_t>(), - rot_dim, - query_stride, - key_stride, - num_heads, - num_kv_heads, - head_size); - } else { - vllm::rotary_embedding_kernel<scalar_t, false><<<grid, block, 0, stream>>>( - positions.data_ptr<int64_t>(), - query.data_ptr<scalar_t>(), - key.data_ptr<scalar_t>(), - cos_sin_cache.data_ptr<scalar_t>(), - rot_dim, - query_stride, - key_stride, - num_heads, - num_kv_heads, - head_size); - } - }); -} diff --git a/csrc/punica/LICENSE b/csrc/punica/LICENSE deleted file mode 100644 index a46e2cdcadf7dc163eb9c329caff0c08d934f58c..0000000000000000000000000000000000000000 --- a/csrc/punica/LICENSE +++ /dev/null @@ -1,217 +0,0 @@ -Contains code from https://github.com/punica-ai/punica - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. 
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- ------------------------------------------------------------------------------------- - -This product bundles various third-party components under other open source licenses. -This section summarizes those components and their licenses. See licenses/ -for text of these licenses. - - -Apache-2.0 -* third_party/nvbench (with LLVM exception) -* third_party/flashinfer - -BSD-3-Clause: -* third_party/cutlass \ No newline at end of file diff --git a/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu b/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu deleted file mode 100644 index c642e94925fe5a13a80c82073239566ff619a787..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu b/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu deleted file mode 100644 index e8202dff561d9af0a062c2514d7dc0acc7e28212..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_half) diff --git a/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu b/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu deleted file mode 100644 index 3e7cf31dead0fd604d99f5761dcc3896e2346c86..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu b/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu deleted file mode 100644 index 68277fa6b7d56782560c64bdcdcd1022dee1af5a..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_half) diff --git a/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu b/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu deleted file mode 100644 index 0607cebfeac40055235d45c634f747aa2f0e2231..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu b/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu deleted file mode 100644 index 3b7531b8fbcfc222f0359c70f134eca4cffa002b..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_half) diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h deleted file mode 100644 index ebf638f104c3fb85483239966d91309a7d42bb8d..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_config.h +++ /dev/null @@ -1,59 +0,0 @@ -#pragma once - -template <int feat_in, int feat_out, typename in_T, typename out_T, - typename W_T> -void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t batch_size, int64_t num_layers, - int64_t layer_idx, float scale); - -// clang-format 
off - -#define FOR_BGMV_WIDE(f, in_T, out_T, W_T, narrow) \ - f(in_T, out_T, W_T, narrow, 128) \ - f(in_T, out_T, W_T, narrow, 256) \ - f(in_T, out_T, W_T, narrow, 512) \ - f(in_T, out_T, W_T, narrow, 1024) \ - f(in_T, out_T, W_T, narrow, 1280) \ - f(in_T, out_T, W_T, narrow, 1728) \ - f(in_T, out_T, W_T, narrow, 1792) \ - f(in_T, out_T, W_T, narrow, 2048) \ - f(in_T, out_T, W_T, narrow, 2560) \ - f(in_T, out_T, W_T, narrow, 2752) \ - f(in_T, out_T, W_T, narrow, 3072) \ - f(in_T, out_T, W_T, narrow, 3456) \ - f(in_T, out_T, W_T, narrow, 3584) \ - f(in_T, out_T, W_T, narrow, 4096) \ - f(in_T, out_T, W_T, narrow, 5120) \ - f(in_T, out_T, W_T, narrow, 5504) \ - f(in_T, out_T, W_T, narrow, 5632) \ - f(in_T, out_T, W_T, narrow, 6912) \ - f(in_T, out_T, W_T, narrow, 7168) \ - f(in_T, out_T, W_T, narrow, 8192) \ - f(in_T, out_T, W_T, narrow, 9216) \ - f(in_T, out_T, W_T, narrow, 10240) \ - f(in_T, out_T, W_T, narrow, 11008) \ - f(in_T, out_T, W_T, narrow, 12288) \ - f(in_T, out_T, W_T, narrow, 13824) \ - f(in_T, out_T, W_T, narrow, 14336) \ - f(in_T, out_T, W_T, narrow, 16384) \ - f(in_T, out_T, W_T, narrow, 20480) \ - f(in_T, out_T, W_T, narrow, 28672) \ - f(in_T, out_T, W_T, narrow, 32000) \ - f(in_T, out_T, W_T, narrow, 32256) \ - f(in_T, out_T, W_T, narrow, 32512) \ - f(in_T, out_T, W_T, narrow, 32768) \ - f(in_T, out_T, W_T, narrow, 33024) \ - f(in_T, out_T, W_T, narrow, 36864) \ - f(in_T, out_T, W_T, narrow, 49152) \ -// Keep above in sync with vllm/lora/layers::SamplerWithLoRA - -// Keep this in sync with vllm/config::LoRAConfig -#define FOR_BGMV_WIDE_NARROW(f, in_T, out_T, W_T) \ - FOR_BGMV_WIDE(f, in_T, out_T, W_T, 8) \ - FOR_BGMV_WIDE(f, in_T, out_T, W_T, 16) \ - FOR_BGMV_WIDE(f, in_T, out_T, W_T, 32) \ - FOR_BGMV_WIDE(f, in_T, out_T, W_T, 64) - -// clang-format on diff --git a/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu b/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu deleted file mode 100644 index b3b74aa3ec904079dae217179c06c08aa1b887e3..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu b/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu deleted file mode 100644 index 3cc87f5df76a16595db7fbb4b6e0f081fe888a1f..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu b/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu deleted file mode 100644 index 9eda98bd8ddcf3d5ae6bb15158a2ff7f642b9831..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu b/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu deleted file mode 100644 index f1db6df5f7338b6a427cae99924f5fd6dcbe39f4..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu b/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu deleted file 
mode 100644 index 060f9ebb8c2b13ce0aab45a3c446f68babd813af..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu b/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu deleted file mode 100644 index c01ddd009d74ec23ae95f08836b9440b6d30e0b6..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu b/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu deleted file mode 100644 index f45183ffd3486de889bf0f1b8666667155621126..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu b/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu deleted file mode 100644 index b37e44570bf40c7afd054dcd7b53c16509ce7a82..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu b/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu deleted file mode 100644 index 06718cbb0a3e957fb91944c6466190fd6dabeee8..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu b/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu deleted file mode 100644 index 4097743488087d8176e1987fa7153700719aac34..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_half) diff --git a/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu b/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu deleted file mode 100644 index 41fb0e45ef4e604a3589711de3742c76feffcfb3..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_bfloat16) diff --git a/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu b/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu deleted file mode 100644 index 50b7ead9fcefd528d425d28a051368fd50c48457..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_half) diff --git a/csrc/punica/bgmv/bgmv_impl.cuh b/csrc/punica/bgmv/bgmv_impl.cuh deleted file mode 100644 index 995de26e8bada0cf6f9b58032ac4ddd8ab05c28b..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/bgmv_impl.cuh +++ /dev/null @@ -1,294 +0,0 @@ -#pragma once - -#include <ATen/cuda/CUDAContext.h> -#include <cooperative_groups.h> -#include <cuda/pipeline> -#include <cuda_runtime.h> 
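// (Descriptive note; not part of the original header.) This file implements the
// punica BGMV kernels used by vLLM's LoRA layers: bgmv_shrink_kernel
// (feat_in >= feat_out) double-buffers W/X tiles in shared memory with
// cuda::pipeline and reduces partial dot products with __shfl_down_sync, while
// bgmv_expand_kernel (feat_in < feat_out) loads one vec_size chunk per thread
// and reduces within a cooperative-groups tile. bgmv_kernel at the end selects
// launch shapes from feat_in/feat_out at compile time.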
-#include <iostream> -#include <stdio.h> - -#include "vec_dtypes.cuh" - -namespace cg = cooperative_groups; - -// nthrs = (32, 4) -template <int feat_in, int feat_out, size_t vec_size, size_t X_copy_size, - size_t W_copy_size, int tx, int ty, int tz, typename in_T, - typename out_T, typename W_T> -__global__ void -bgmv_shrink_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t num_layers, int64_t layer_idx, - float scale) { - size_t batch_idx = blockIdx.y; - int64_t idx = indicies[batch_idx] * num_layers + layer_idx; - if (idx < 0) { - return; - } - - auto block = cg::this_thread_block(); - size_t j = blockIdx.x; - constexpr size_t num_pipeline_stages = 2; - constexpr size_t tile_size = tx * ty * vec_size; - __shared__ W_T W_shared[num_pipeline_stages * tile_size]; - __shared__ in_T X_shared[num_pipeline_stages * tile_size]; - __shared__ float y_warpwise[ty]; - - size_t W_shared_offset[num_pipeline_stages] = {0U, 1U * tile_size}; - size_t X_shared_offset[num_pipeline_stages] = {0U, 1U * tile_size}; - auto pipe = cuda::make_pipeline(); - - // pipeline load W/X and compute WX; - pipe.producer_acquire(); - cuda::memcpy_async(W_shared + (threadIdx.y * tx + threadIdx.x) * vec_size, - W + (idx * feat_out + j) * feat_in + - (threadIdx.y * tx + threadIdx.x) * vec_size, - cuda::aligned_size_t<W_copy_size>(W_copy_size), pipe); - cuda::memcpy_async(X_shared + (threadIdx.y * tx + threadIdx.x) * vec_size, - X + (batch_idx * feat_in) + - (threadIdx.y * tx + threadIdx.x) * vec_size, - cuda::aligned_size_t<X_copy_size>(X_copy_size), pipe); - pipe.producer_commit(); - size_t copy_idx, compute_idx; - float y = 0.f; - vec_t<in_T, vec_size> x_vec; - vec_t<W_T, vec_size> w_vec; - size_t tile_idx; - -#pragma unroll - for (tile_idx = 1; tile_idx < (feat_in + tile_size - 1) / tile_size; - ++tile_idx) { - copy_idx = tile_idx % num_pipeline_stages; - // pipeline stage: async copy W fragment - pipe.producer_acquire(); - if (tile_idx * tile_size + threadIdx.y * tx * vec_size < feat_in) { - cuda::memcpy_async(W_shared + W_shared_offset[copy_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size, - W + (idx * feat_out + j) * feat_in + - tile_idx * tile_size + - (threadIdx.y * tx + threadIdx.x) * vec_size, - cuda::aligned_size_t<W_copy_size>(W_copy_size), pipe); - cuda::memcpy_async(X_shared + X_shared_offset[copy_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size, - X + (batch_idx * feat_in) + tile_idx * tile_size + - (threadIdx.y * tx + threadIdx.x) * vec_size, - cuda::aligned_size_t<X_copy_size>(X_copy_size), pipe); - } - pipe.producer_commit(); - - compute_idx = (tile_idx - 1) % num_pipeline_stages; - // pipeline stage: compute WX - pipe.consumer_wait(); - block.sync(); - x_vec.load(X_shared + X_shared_offset[compute_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size); - w_vec.load(W_shared + W_shared_offset[compute_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size); - float sum = 0.f; -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - sum += float(w_vec[i]) * float(x_vec[i]) * scale; - } -#pragma unroll - for (size_t offset = tx / 2; offset > 0; offset /= 2) { - sum += __shfl_down_sync(0xffffffff, sum, offset); - } - y_warpwise[threadIdx.y] = sum; - block.sync(); -#pragma unroll - for (size_t i = 0; i < ty; ++i) { - y += y_warpwise[i]; - } - - block.sync(); - pipe.consumer_release(); - } - - compute_idx = (tile_idx - 1) % num_pipeline_stages; - // final pipeline stage - 
pipe.consumer_wait(); - block.sync(); - x_vec.load(X_shared + X_shared_offset[compute_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size); - w_vec.load(W_shared + W_shared_offset[compute_idx] + - (threadIdx.y * tx + threadIdx.x) * vec_size); - float sum = 0.f; -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - sum += float(w_vec[i]) * float(x_vec[i]) * scale; - } -#pragma unroll - for (size_t offset = tx / 2; offset > 0; offset /= 2) { - sum += __shfl_down_sync(0xffffffff, sum, offset); - } - y_warpwise[threadIdx.y] = - ((tile_idx - 1) * tile_size + threadIdx.y * tx * vec_size < feat_in) - ? sum - : 0.f; - block.sync(); -#pragma unroll - for (size_t i = 0; i < ty; ++i) { - y += y_warpwise[i]; - } - - block.sync(); - pipe.consumer_release(); - - // write Y; - if (block.thread_rank() == 0) { - Y[batch_idx * full_y_size + y_offset + j] += static_cast<out_T>(y); - } -} - -// nthrs = (2, 16, 4) -template <int feat_in, int feat_out, size_t vec_size, int tx, int ty, int tz, - typename in_T, typename out_T, typename W_T> -__global__ void -bgmv_expand_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t num_layers, int64_t layer_idx, - float scale) { - size_t batch_idx = blockIdx.y; - int64_t idx = indicies[batch_idx] * num_layers + layer_idx; - - if (idx < 0) { - return; - } - - auto block = cg::this_thread_block(); - size_t tile_idx = blockIdx.x; - - // load X; - vec_t<in_T, vec_size> x_vec; - x_vec.load(X + batch_idx * feat_in + threadIdx.x * vec_size); - - // load W; - vec_t<W_T, vec_size> w_vec; - w_vec.load(W + (idx * feat_out + tile_idx * tz * ty) * feat_in + - block.thread_rank() * vec_size); - - float sum = 0.f; -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - sum += float(w_vec[i]) * float(x_vec[i]) * scale; - } - - cg::thread_block_tile g = cg::tiled_partition<tx>(block); -#pragma unroll - for (size_t offset = tx / 2; offset > 0; offset /= 2) { - sum += g.shfl_down(sum, offset); - } - sum = g.shfl(sum, 0); - - if (threadIdx.x == 0) { - Y[batch_idx * full_y_size + y_offset + tile_idx * (tz * ty) + - threadIdx.z * ty + threadIdx.y] += static_cast<out_T>(sum); - } -} - -template <int feat_in, int feat_out, typename in_T, typename out_T, - typename W_T> -void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, - const W_T *__restrict__ W, - const int64_t *__restrict__ indicies, int64_t y_offset, - int64_t full_y_size, int64_t batch_size, int64_t num_layers, - int64_t layer_idx, float scale) { - constexpr size_t vec_size = 8; - constexpr int tz = 4; - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - if constexpr (feat_in < feat_out) { - static_assert(feat_in % vec_size == 0); - constexpr int tx = feat_in / vec_size; - - static_assert((32 % tx == 0 && feat_out % (32 / tx * tz) == 0) || - (16 % tx == 0 && feat_out % (16 / tx * tz) == 0) || - (8 % tx == 0 && feat_out % (8 / tx * tz) == 0)); - - if constexpr (32 % tx == 0 && feat_out % (32 / tx * tz) == 0) { - constexpr int ty = 32 / tx; - dim3 nblks(feat_out / (ty * tz), batch_size); - dim3 nthrs(tx, ty, tz); - - bgmv_expand_kernel<feat_in, feat_out, vec_size, tx, ty, tz> - <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } else if (16 % tx == 0 && feat_out % (16 / tx * tz) == 0) { - constexpr int ty = 16 / tx; - dim3 nblks(feat_out / (ty * tz), batch_size); - dim3 nthrs(tx, ty, tz); - - bgmv_expand_kernel<feat_in, 
feat_out, vec_size, tx, ty, tz> - <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } else { - constexpr int ty = 8 / tx; - dim3 nblks(feat_out / (ty * tz), batch_size); - dim3 nthrs(tx, ty, tz); - - bgmv_expand_kernel<feat_in, feat_out, vec_size, tx, ty, tz> - <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } - } else { - static_assert(feat_in % (vec_size * 32) == 0 || - feat_in % (vec_size * 16) == 0 || - feat_in % (vec_size * 8) == 0); - - if constexpr (feat_in % (vec_size * 32) == 0) { - constexpr int tx = 32; - constexpr int ty = 4; - - dim3 nblks(feat_out, batch_size); - dim3 nthrs(tx, ty); - - bgmv_shrink_kernel<feat_in, feat_out, vec_size, vec_size * sizeof(in_T), - vec_size * sizeof(W_T), tx, ty, tz> - <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } else if constexpr (feat_in % (vec_size / 2 * 32) == 0) { - constexpr int tx = 32; - constexpr int ty = 4; - - dim3 nblks(feat_out, batch_size); - dim3 nthrs(tx, ty); - - bgmv_shrink_kernel<feat_in, feat_out, vec_size / 2, - vec_size * sizeof(in_T) / 2, - vec_size * sizeof(W_T) / 2, tx, ty, tz> - <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } else if constexpr (feat_in % (vec_size / 2 * 16) == 0) { - constexpr int tx = 16; - constexpr int ty = 4; - - dim3 nblks(feat_out, batch_size); - dim3 nthrs(tx, ty); - - bgmv_shrink_kernel<feat_in, feat_out, vec_size / 2, - vec_size * sizeof(in_T) / 2, - vec_size * sizeof(W_T) / 2, tx, ty, tz> - <<<nblks, nthrs, 0, stream>>>(Y, X, W, indicies, y_offset, - full_y_size, num_layers, layer_idx, - scale); - } - } -} - -#define INST_BGMV(feat_in, feat_out, in_T, out_T, W_T) \ - template void bgmv_kernel<feat_in, feat_out>( \ - out_T * __restrict__ Y, const in_T *__restrict__ X, \ - const W_T *__restrict__ W, const int64_t *__restrict__ indicies, \ - int64_t y_offset, int64_t full_y_size, int64_t batch_size, \ - int64_t num_layers, int64_t layer_idx, float scale); - -#define INST_BGMV_TWOSIDE(in_T, out_T, W_T, narrow, wide) \ - INST_BGMV(narrow, wide, in_T, out_T, W_T) \ - INST_BGMV(wide, narrow, in_T, out_T, W_T) diff --git a/csrc/punica/bgmv/generator.py b/csrc/punica/bgmv/generator.py deleted file mode 100644 index 66de56d74f3e71a2d80840c7c5f1dc1303b99269..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/generator.py +++ /dev/null @@ -1,27 +0,0 @@ -DTYPES = ["fp16", "bf16", "fp32"] -DTYPE_MAP = { - "fp16": "nv_half", - "bf16": "nv_bfloat16", - "fp32": "float", -} - -TEMPLATE = """ -#include "bgmv_config.h" -#include "bgmv_impl.cuh" - -FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) -""".lstrip() - -for input_dtype in DTYPES: - for output_dtype in DTYPES: - for weight_dtype in DTYPES: - if weight_dtype == "fp32": - # FP32 weights are not supported. 
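                # (Editorial note.) With fp32 weights skipped, this script
                # emits 3 input dtypes x 3 output dtypes x 2 weight dtypes
                # = 18 bgmv_{input}_{output}_{weight}.cu files, matching the
                # sources removed earlier in this diff.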
- continue - kernel_definition = TEMPLATE.format( - input_dtype=DTYPE_MAP[input_dtype], - output_dtype=DTYPE_MAP[output_dtype], - weight_dtype=DTYPE_MAP[weight_dtype]) - filename = f"bgmv_{input_dtype}_{output_dtype}_{weight_dtype}.cu" - with open(filename, "w") as f: - f.write(kernel_definition) diff --git a/csrc/punica/bgmv/vec_dtypes.cuh b/csrc/punica/bgmv/vec_dtypes.cuh deleted file mode 100644 index cf00d869cf635cfc49ab43a39e84da6bd3b92b2d..0000000000000000000000000000000000000000 --- a/csrc/punica/bgmv/vec_dtypes.cuh +++ /dev/null @@ -1,1324 +0,0 @@ -#ifndef VEC_DTYPES_CUH_ -#define VEC_DTYPES_CUH_ - -#include <cuda_bf16.h> -#include <cuda_fp16.h> -#ifdef FLASHINFER_USE_FP8 -#include <cuda_fp8.h> -#endif -#include <cuda_runtime.h> - -#include <type_traits> - -#define FLASHINFER_INLINE \ - inline __attribute__((always_inline)) __device__ __host__ - -template <typename float_t, size_t vec_size> -struct vec_t { - FLASHINFER_INLINE float_t &operator[](size_t i); - FLASHINFER_INLINE const float_t &operator[](size_t i) const; - FLASHINFER_INLINE void fill(float_t val); - FLASHINFER_INLINE void load(const float_t *ptr); - FLASHINFER_INLINE void store(float_t *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, vec_size> &src); - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr); - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const; - FLASHINFER_INLINE static void memcpy(float_t *dst, const float_t *src); -}; - -template <typename src_float_t, typename tgt_float_t, size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<src_float_t, vec_size> &src, - vec_t<tgt_float_t, vec_size> &dst) { -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - dst[i] = tgt_float_t(src[i]); - } -} - -template <typename src_float_t, typename tgt_float_t, size_t vec_size> -FLASHINFER_INLINE void cast_load_impl(const src_float_t *src_ptr, - vec_t<tgt_float_t, vec_size> &dst) { - if constexpr (std::is_same<src_float_t, tgt_float_t>::value) { - dst.load(src_ptr); - } else { - vec_t<src_float_t, vec_size> tmp; - tmp.load(src_ptr); - dst.cast_from(tmp); - } -} - -template <typename src_float_t, typename tgt_float_t, size_t vec_size> -FLASHINFER_INLINE void cast_store_impl(const vec_t<src_float_t, vec_size> &src, - tgt_float_t *dst_ptr) { - if constexpr (std::is_same<src_float_t, tgt_float_t>::value) { - src.store(dst_ptr); - } else { - vec_t<tgt_float_t, vec_size> tmp; - tmp.cast_from(src); - tmp.store(dst_ptr); - } -} - -#ifdef FLASHINFER_USE_FP8 -/******************* vec_t<__nv_fp8_e4m3> *******************/ - -// __nv_fp8_e4m3 x 1 -template <> -struct vec_t<__nv_fp8_e4m3, 1> { - __nv_fp8_e4m3 data; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val); - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 1> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src); -}; - -FLASHINFER_INLINE 
void vec_t<__nv_fp8_e4m3, 1>::fill(__nv_fp8_e4m3 val) { - data = val; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::load(const __nv_fp8_e4m3 *ptr) { - data = *ptr; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::store( - __nv_fp8_e4m3 *ptr) const { - *ptr = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 1>::memcpy( - __nv_fp8_e4m3 *dst, const __nv_fp8_e4m3 *src) { - *dst = *src; -} - -// __nv_fp8_e4m3 x 2 -template <> -struct vec_t<__nv_fp8_e4m3, 2> { - __nv_fp8x2_e4m3 data; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val); - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 2> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::fill(__nv_fp8_e4m3 val) { - data.__x = - (__nv_fp8x2_storage_t(val.__x) << 8) | __nv_fp8x2_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::load(const __nv_fp8_e4m3 *ptr) { - data = *((__nv_fp8x2_e4m3 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::store( - __nv_fp8_e4m3 *ptr) const { - *((__nv_fp8x2_e4m3 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 2>::memcpy( - __nv_fp8_e4m3 *dst, const __nv_fp8_e4m3 *src) { - *((__nv_fp8x2_e4m3 *)dst) = *((__nv_fp8x2_e4m3 *)src); -} - -// __nv_fp8_e4m3 x 4 - -template <> -struct vec_t<__nv_fp8_e4m3, 4> { - __nv_fp8x4_e4m3 data; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val); - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 4> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::fill(__nv_fp8_e4m3 val) { - data.__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::load(const __nv_fp8_e4m3 *ptr) { - data = *((__nv_fp8x4_e4m3 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::store( - __nv_fp8_e4m3 *ptr) const { - *((__nv_fp8x4_e4m3 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 4>::memcpy( - __nv_fp8_e4m3 *dst, const __nv_fp8_e4m3 *src) { - *((__nv_fp8x4_e4m3 *)dst) = *((__nv_fp8x4_e4m3 *)src); -} - -// __nv_fp8_e4m3 x 8 - -template <> -struct 
vec_t<__nv_fp8_e4m3, 8> { - uint2 data; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val); - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 8> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::fill(__nv_fp8_e4m3 val) { - ((__nv_fp8x4_e4m3 *)(&data.x))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e4m3 *)(&data.y))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::load(const __nv_fp8_e4m3 *ptr) { - data = *((uint2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::store( - __nv_fp8_e4m3 *ptr) const { - *((uint2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e4m3, 8>::memcpy( - __nv_fp8_e4m3 *dst, const __nv_fp8_e4m3 *src) { - *((__nv_fp8_e4m3 *)dst) = *((__nv_fp8_e4m3 *)src); -} - -// __nv_fp8_e4m3 x 16 or more -template <size_t vec_size> -struct vec_t<__nv_fp8_e4m3, vec_size> { - uint4 data[vec_size / 16]; - - FLASHINFER_INLINE __nv_fp8_e4m3 &operator[](size_t i) { - return ((__nv_fp8_e4m3 *)data)[i]; - } - FLASHINFER_INLINE const __nv_fp8_e4m3 &operator[](size_t i) const { - return ((const __nv_fp8_e4m3 *)data)[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e4m3 val) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((__nv_fp8x4_e4m3 *)(&(data[i].x)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e4m3 *)(&(data[i].y)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e4m3 *)(&(data[i].z)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e4m3 *)(&(data[i].w)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - } - } - FLASHINFER_INLINE void load(const __nv_fp8_e4m3 *ptr) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - data[i] = ((uint4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(__nv_fp8_e4m3 *ptr) const { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((uint4 *)ptr)[i] = data[i]; - } - } - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, vec_size> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, 
*this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e4m3 *dst, - const __nv_fp8_e4m3 *src) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((uint4 *)dst)[i] = ((uint4 *)src)[i]; - } - } -}; - -/******************* vec_t<__nv_fp8_e5m2> *******************/ - -// __nv_fp8_e5m2 x 1 -template <> -struct vec_t<__nv_fp8_e5m2, 1> { - __nv_fp8_e5m2 data; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val); - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 1> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::fill(__nv_fp8_e5m2 val) { - data = val; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::load(const __nv_fp8_e5m2 *ptr) { - data = *ptr; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::store( - __nv_fp8_e5m2 *ptr) const { - *ptr = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 1>::memcpy( - __nv_fp8_e5m2 *dst, const __nv_fp8_e5m2 *src) { - *dst = *src; -} - -// __nv_fp8_e5m2 x 2 -template <> -struct vec_t<__nv_fp8_e5m2, 2> { - __nv_fp8x2_e5m2 data; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val); - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 2> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::fill(__nv_fp8_e5m2 val) { - data.__x = - (__nv_fp8x2_storage_t(val.__x) << 8) | __nv_fp8x2_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::load(const __nv_fp8_e5m2 *ptr) { - data = *((__nv_fp8x2_e5m2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::store( - __nv_fp8_e5m2 *ptr) const { - *((__nv_fp8x2_e5m2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 2>::memcpy( - __nv_fp8_e5m2 *dst, const __nv_fp8_e5m2 *src) { - *((__nv_fp8x2_e5m2 *)dst) = *((__nv_fp8x2_e5m2 *)src); -} - -// __nv_fp8_e5m2 x 4 - -template <> -struct vec_t<__nv_fp8_e5m2, 4> { - __nv_fp8x4_e5m2 data; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 
*)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val); - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 4> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::fill(__nv_fp8_e5m2 val) { - data.__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::load(const __nv_fp8_e5m2 *ptr) { - data = *((__nv_fp8x4_e5m2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::store( - __nv_fp8_e5m2 *ptr) const { - *((__nv_fp8x4_e5m2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 4>::memcpy( - __nv_fp8_e5m2 *dst, const __nv_fp8_e5m2 *src) { - *((__nv_fp8x4_e5m2 *)dst) = *((__nv_fp8x4_e5m2 *)src); -} - -// __nv_fp8_e5m2 x 8 - -template <> -struct vec_t<__nv_fp8_e5m2, 8> { - uint2 data; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val); - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr); - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 8> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src); -}; - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::fill(__nv_fp8_e5m2 val) { - ((__nv_fp8x4_e5m2 *)(&data.x))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e5m2 *)(&data.y))->__x = (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | - __nv_fp8x4_storage_t(val.__x); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::load(const __nv_fp8_e5m2 *ptr) { - data = *((uint2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::store( - __nv_fp8_e5m2 *ptr) const { - *((uint2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<__nv_fp8_e5m2, 8>::memcpy( - __nv_fp8_e5m2 *dst, const __nv_fp8_e5m2 *src) { - *((__nv_fp8_e5m2 *)dst) = *((__nv_fp8_e5m2 *)src); -} - -// __nv_fp8_e5m2 x 16 or more - -template <size_t vec_size> -struct vec_t<__nv_fp8_e5m2, vec_size> { - uint4 data[vec_size / 16]; - - FLASHINFER_INLINE __nv_fp8_e5m2 &operator[](size_t i) { - return ((__nv_fp8_e5m2 *)data)[i]; - } - FLASHINFER_INLINE const __nv_fp8_e5m2 &operator[](size_t i) const { - return ((const __nv_fp8_e5m2 *)data)[i]; - } - FLASHINFER_INLINE void fill(__nv_fp8_e5m2 val) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((__nv_fp8x4_e5m2 
*)(&(data[i].x)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e5m2 *)(&(data[i].y)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e5m2 *)(&(data[i].z)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - ((__nv_fp8x4_e5m2 *)(&(data[i].w)))->__x = - (__nv_fp8x4_storage_t(val.__x) << 24) | - (__nv_fp8x4_storage_t(val.__x) << 16) | - (__nv_fp8x4_storage_t(val.__x) << 8) | __nv_fp8x4_storage_t(val.__x); - } - } - FLASHINFER_INLINE void load(const __nv_fp8_e5m2 *ptr) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - data[i] = ((uint4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(__nv_fp8_e5m2 *ptr) const { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((uint4 *)ptr)[i] = data[i]; - } - } - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, vec_size> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(__nv_fp8_e5m2 *dst, - const __nv_fp8_e5m2 *src) { -#pragma unroll - for (size_t i = 0; i < vec_size / 16; ++i) { - ((uint4 *)dst)[i] = ((uint4 *)src)[i]; - } - } -}; -#endif - -/******************* vec_t<half> *******************/ - -// half x 1 -template <> -struct vec_t<half, 1> { - half data; - - FLASHINFER_INLINE half &operator[](size_t i) { return ((half *)(&data))[i]; } - FLASHINFER_INLINE const half &operator[](size_t i) const { - return ((const half *)(&data))[i]; - } - FLASHINFER_INLINE void fill(half val); - FLASHINFER_INLINE void load(const half *ptr); - FLASHINFER_INLINE void store(half *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 1> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(half *dst, const half *src); -}; - -FLASHINFER_INLINE void vec_t<half, 1>::fill(half val) { data = val; } - -FLASHINFER_INLINE void vec_t<half, 1>::load(const half *ptr) { data = *ptr; } - -FLASHINFER_INLINE void vec_t<half, 1>::store(half *ptr) const { *ptr = data; } - -FLASHINFER_INLINE void vec_t<half, 1>::memcpy(half *dst, const half *src) { - *dst = *src; -} - -// half x 2 -template <> -struct vec_t<half, 2> { - half2 data; - - FLASHINFER_INLINE half &operator[](size_t i) { return ((half *)(&data))[i]; } - FLASHINFER_INLINE const half &operator[](size_t i) const { - return ((const half *)(&data))[i]; - } - FLASHINFER_INLINE void fill(half val); - FLASHINFER_INLINE void load(const half *ptr); - FLASHINFER_INLINE void store(half *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 2> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) 
const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(half *dst, const half *src); -}; - -FLASHINFER_INLINE void vec_t<half, 2>::fill(half val) { - data = make_half2(val, val); -} - -FLASHINFER_INLINE void vec_t<half, 2>::load(const half *ptr) { - data = *((half2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<half, 2>::store(half *ptr) const { - *((half2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<half, 2>::memcpy(half *dst, const half *src) { - *((half2 *)dst) = *((half2 *)src); -} - -// half x 4 - -template <> -struct vec_t<half, 4> { - uint2 data; - - FLASHINFER_INLINE half &operator[](size_t i) { return ((half *)(&data))[i]; } - FLASHINFER_INLINE const half &operator[](size_t i) const { - return ((const half *)(&data))[i]; - } - FLASHINFER_INLINE void fill(half val); - FLASHINFER_INLINE void load(const half *ptr); - FLASHINFER_INLINE void store(half *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 4> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(half *dst, const half *src); -}; - -FLASHINFER_INLINE void vec_t<half, 4>::fill(half val) { - *(half2 *)(&data.x) = make_half2(val, val); - *(half2 *)(&data.y) = make_half2(val, val); -} - -FLASHINFER_INLINE void vec_t<half, 4>::load(const half *ptr) { - data = *((uint2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<half, 4>::store(half *ptr) const { - *((uint2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<half, 4>::memcpy(half *dst, const half *src) { - *((uint2 *)dst) = *((uint2 *)src); -} - -// half x 8 or more - -template <size_t vec_size> -struct vec_t<half, vec_size> { - uint4 data[vec_size / 8]; - FLASHINFER_INLINE half &operator[](size_t i) { return ((half *)data)[i]; } - FLASHINFER_INLINE const half &operator[](size_t i) const { - return ((const half *)data)[i]; - } - FLASHINFER_INLINE void fill(half val) { -#pragma unroll - for (size_t i = 0; i < vec_size; ++i) { - *(half2 *)(&(data[i].x)) = make_half2(val, val); - *(half2 *)(&(data[i].y)) = make_half2(val, val); - *(half2 *)(&(data[i].z)) = make_half2(val, val); - *(half2 *)(&(data[i].w)) = make_half2(val, val); - } - } - FLASHINFER_INLINE void load(const half *ptr) { -#pragma unroll - for (size_t i = 0; i < vec_size / 8; ++i) { - data[i] = ((uint4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(half *ptr) const { -#pragma unroll - for (size_t i = 0; i < vec_size / 8; ++i) { - ((uint4 *)ptr)[i] = data[i]; - } - } - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, vec_size> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(half *dst, const half *src) { -#pragma unroll - for (size_t i = 0; i < vec_size / 8; ++i) { - ((uint4 *)dst)[i] = ((uint4 *)src)[i]; - } - } -}; - -/******************* vec_t<nv_bfloat16> *******************/ - -// nv_bfloat16 x 1 -template <> -struct vec_t<nv_bfloat16, 1> { - nv_bfloat16 data; - - FLASHINFER_INLINE nv_bfloat16 &operator[](size_t i) { - return ((nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE const nv_bfloat16 &operator[](size_t i) const { - return 
((const nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(nv_bfloat16 val); - FLASHINFER_INLINE void load(const nv_bfloat16 *ptr); - FLASHINFER_INLINE void store(nv_bfloat16 *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 1> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src); -}; - -FLASHINFER_INLINE void vec_t<nv_bfloat16, 1>::fill(nv_bfloat16 val) { - data = val; -} - -FLASHINFER_INLINE void vec_t<nv_bfloat16, 1>::load(const nv_bfloat16 *ptr) { - data = *ptr; -} - -FLASHINFER_INLINE void vec_t<nv_bfloat16, 1>::store(nv_bfloat16 *ptr) const { - *ptr = data; -} - -FLASHINFER_INLINE void vec_t<nv_bfloat16, 1>::memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src) { - *dst = *src; -} - -// nv_bfloat16 x 2 -template <> -struct vec_t<nv_bfloat16, 2> { - nv_bfloat162 data; - - FLASHINFER_INLINE nv_bfloat16 &operator[](size_t i) { - return ((nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE const nv_bfloat16 &operator[](size_t i) const { - return ((const nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(nv_bfloat16 val); - FLASHINFER_INLINE void load(const nv_bfloat16 *ptr); - FLASHINFER_INLINE void store(nv_bfloat16 *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 2> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src); -}; - -FLASHINFER_INLINE void vec_t<nv_bfloat16, 2>::fill(nv_bfloat16 val) { - data = make_bfloat162(val, val); -} - -FLASHINFER_INLINE void vec_t<nv_bfloat16, 2>::load(const nv_bfloat16 *ptr) { - data = *((nv_bfloat162 *)ptr); -} - -FLASHINFER_INLINE void vec_t<nv_bfloat16, 2>::store(nv_bfloat16 *ptr) const { - *((nv_bfloat162 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<nv_bfloat16, 2>::memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src) { - *((nv_bfloat162 *)dst) = *((nv_bfloat162 *)src); -} - -// nv_bfloat16 x 4 - -template <> -struct vec_t<nv_bfloat16, 4> { - uint2 data; - - FLASHINFER_INLINE nv_bfloat16 &operator[](size_t i) { - return ((nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE const nv_bfloat16 &operator[](size_t i) const { - return ((const nv_bfloat16 *)(&data))[i]; - } - FLASHINFER_INLINE void fill(nv_bfloat16 val); - FLASHINFER_INLINE void load(const nv_bfloat16 *ptr); - FLASHINFER_INLINE void store(nv_bfloat16 *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 4> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src); -}; - -FLASHINFER_INLINE void vec_t<nv_bfloat16, 4>::fill(nv_bfloat16 val) { - *(nv_bfloat162 *)(&data.x) = make_bfloat162(val, val); - *(nv_bfloat162 *)(&data.y) = make_bfloat162(val, val); -} - -FLASHINFER_INLINE void vec_t<nv_bfloat16, 
4>::load(const nv_bfloat16 *ptr) { - data = *((uint2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<nv_bfloat16, 4>::store(nv_bfloat16 *ptr) const { - *((uint2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<nv_bfloat16, 4>::memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src) { - *((uint2 *)dst) = *((uint2 *)src); -} - -// nv_bfloat16 x 8 or more - -template <size_t vec_size> -struct vec_t<nv_bfloat16, vec_size> { - uint4 data[vec_size / 8]; - - FLASHINFER_INLINE nv_bfloat16 &operator[](size_t i) { - return ((nv_bfloat16 *)data)[i]; - } - FLASHINFER_INLINE const nv_bfloat16 &operator[](size_t i) const { - return ((const nv_bfloat16 *)data)[i]; - } - FLASHINFER_INLINE void fill(nv_bfloat16 val) { -#pragma unoll - for (size_t i = 0; i < vec_size / 8; ++i) { - *(nv_bfloat162 *)(&(data[i].x)) = make_bfloat162(val, val); - *(nv_bfloat162 *)(&(data[i].y)) = make_bfloat162(val, val); - *(nv_bfloat162 *)(&(data[i].z)) = make_bfloat162(val, val); - *(nv_bfloat162 *)(&(data[i].w)) = make_bfloat162(val, val); - } - } - FLASHINFER_INLINE void load(const nv_bfloat16 *ptr) { -#pragma unoll - for (size_t i = 0; i < vec_size / 8; ++i) { - data[i] = ((uint4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(nv_bfloat16 *ptr) const { -#pragma unoll - for (size_t i = 0; i < vec_size / 8; ++i) { - ((uint4 *)ptr)[i] = data[i]; - } - } - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, vec_size> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(nv_bfloat16 *dst, - const nv_bfloat16 *src) { -#pragma unoll - for (size_t i = 0; i < vec_size / 8; ++i) { - ((uint4 *)dst)[i] = ((uint4 *)src)[i]; - } - } -}; - -/******************* vec_t<float> *******************/ - -// float x 1 - -template <> -struct vec_t<float, 1> { - float data; - - FLASHINFER_INLINE float &operator[](size_t i) { - return ((float *)(&data))[i]; - } - FLASHINFER_INLINE const float &operator[](size_t i) const { - return ((const float *)(&data))[i]; - } - FLASHINFER_INLINE void fill(float val); - FLASHINFER_INLINE void load(const float *ptr); - FLASHINFER_INLINE void store(float *ptr) const; - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, 1> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - - FLASHINFER_INLINE static void memcpy(float *dst, const float *src); -}; - -FLASHINFER_INLINE void vec_t<float, 1>::fill(float val) { data = val; } - -FLASHINFER_INLINE void vec_t<float, 1>::load(const float *ptr) { data = *ptr; } - -FLASHINFER_INLINE void vec_t<float, 1>::store(float *ptr) const { *ptr = data; } - -FLASHINFER_INLINE void vec_t<float, 1>::memcpy(float *dst, const float *src) { - *dst = *src; -} - -// float x 2 - -template <> -struct vec_t<float, 2> { - float2 data; - - FLASHINFER_INLINE float &operator[](size_t i) { - return ((float *)(&data))[i]; - } - FLASHINFER_INLINE const float &operator[](size_t i) const { - return ((const float *)(&data))[i]; - } - FLASHINFER_INLINE void fill(float val); - FLASHINFER_INLINE void load(const float *ptr); - FLASHINFER_INLINE void store(float *ptr) const; - template <typename T> - FLASHINFER_INLINE void 
cast_from(const vec_t<T, 2> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - FLASHINFER_INLINE static void memcpy(float *dst, const float *src); -}; - -FLASHINFER_INLINE void vec_t<float, 2>::fill(float val) { - data = make_float2(val, val); -} - -FLASHINFER_INLINE void vec_t<float, 2>::load(const float *ptr) { - data = *((float2 *)ptr); -} - -FLASHINFER_INLINE void vec_t<float, 2>::store(float *ptr) const { - *((float2 *)ptr) = data; -} - -FLASHINFER_INLINE void vec_t<float, 2>::memcpy(float *dst, const float *src) { - *((float2 *)dst) = *((float2 *)src); -} - -// float x 4 or more -template <size_t vec_size> -struct vec_t<float, vec_size> { - float4 data[vec_size / 4]; - - FLASHINFER_INLINE float &operator[](size_t i) { return ((float *)(data))[i]; } - FLASHINFER_INLINE const float &operator[](size_t i) const { - return ((const float *)(data))[i]; - } - FLASHINFER_INLINE void fill(float val) { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - data[i] = make_float4(val, val, val, val); - } - } - FLASHINFER_INLINE void load(const float *ptr) { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - data[i] = ((float4 *)ptr)[i]; - } - } - FLASHINFER_INLINE void store(float *ptr) const { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((float4 *)ptr)[i] = data[i]; - } - } - template <typename T> - FLASHINFER_INLINE void cast_from(const vec_t<T, vec_size> &src) { - cast_from_impl(src, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_load(const T *ptr) { - cast_load_impl(ptr, *this); - } - template <typename T> - FLASHINFER_INLINE void cast_store(T *ptr) const { - cast_store_impl(*this, ptr); - } - FLASHINFER_INLINE static void memcpy(float *dst, const float *src) { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((float4 *)dst)[i] = ((float4 *)src)[i]; - } - } -}; - -/******************* vec_t type cast *******************/ - -template <size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<half, vec_size> &src, - vec_t<float, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((float2 *)(&dst.data))[i] = __half22float2(((half2 *)(&src.data))[i]); - } - } -} - -template <size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<float, vec_size> &src, - vec_t<half, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = half(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((half2 *)(&dst.data))[i] = __float22half2_rn(((float2 *)(&src.data))[i]); - } - } -} - -template <size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<nv_bfloat16, vec_size> &src, - vec_t<float, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((float2 *)(&dst.data))[i] = - __bfloat1622float2(((nv_bfloat162 *)(&src.data))[i]); - } - } -} - -template <size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<float, vec_size> &src, - vec_t<nv_bfloat16, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = nv_bfloat16(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((nv_bfloat162 *)(&dst.data))[i] = 
- __float22bfloat162_rn(((float2 *)(&src.data))[i]); - } - } -} - -#ifdef FLASHINFER_USE_FP8 - -template <size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<__nv_fp8_e4m3, vec_size> &src, - vec_t<float, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else if constexpr (vec_size == 2) { - *(float2 *)(&dst.data) = float2(*(__nv_fp8x2_e4m3 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((float4 *)(&dst.data))[i] = float4(((__nv_fp8x4_e4m3 *)(&src.data))[i]); - } - } -} - -template <size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<__nv_fp8_e4m3, vec_size> &src, - vec_t<half, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((half2 *)(&dst.data))[i] = half2(((__nv_fp8x2_e4m3 *)(&src.data))[i]); - } - } -} - -template <size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<float, vec_size> &src, - vec_t<__nv_fp8_e4m3, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = __nv_fp8_e4m3(src.data); - } else if constexpr (vec_size == 2) { - *(__nv_fp8x2_e4m3 *)(&dst.data) = __nv_fp8x2_e4m3(*(float2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((__nv_fp8x4_e4m3 *)(&dst.data))[i] = - __nv_fp8x4_e4m3(((float4 *)(&src.data))[i]); - } - } -} - -template <size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<half, vec_size> &src, - vec_t<__nv_fp8_e4m3, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = __nv_fp8_e4m3(src.data); - } else if constexpr (vec_size == 2) { - *(__nv_fp8x2_e4m3 *)(&dst.data) = __nv_fp8x2_e4m3(*(half2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - // NOTE(Zihao): need to double check if we properly handle flo and fhi - ((__nv_fp8x4_e4m3 *)(&dst.data))[i] = __nv_fp8x4_e4m3( - ((half2 *)(&src.data))[i * 2], ((half2 *)(&src.data))[i * 2 + 1]); - } - } -} - -template <size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<__nv_fp8_e5m2, vec_size> &src, - vec_t<float, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else if constexpr (vec_size == 2) { - *(float2 *)(&dst.data) = float2(*(__nv_fp8x2_e5m2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((float4 *)(&dst.data))[i] = float4(((__nv_fp8x4_e5m2 *)(&src.data))[i]); - } - } -} - -template <size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<__nv_fp8_e5m2, vec_size> &src, - vec_t<half, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = float(src.data); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 2; ++i) { - ((half2 *)(&dst.data))[i] = half2(((__nv_fp8x2_e5m2 *)(&src.data))[i]); - } - } -} - -template <size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<float, vec_size> &src, - vec_t<__nv_fp8_e5m2, vec_size> &dst) { - if constexpr (vec_size == 1) { - dst.data = __nv_fp8_e5m2(src.data); - } else if constexpr (vec_size == 2) { - *(__nv_fp8x2_e5m2 *)(&dst.data) = __nv_fp8x2_e5m2(*(float2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - ((__nv_fp8x4_e5m2 *)(&dst.data))[i] = - __nv_fp8x4_e5m2(((float4 *)(&src.data))[i]); - } - } -} - -template <size_t vec_size> -FLASHINFER_INLINE void cast_from_impl(const vec_t<half, vec_size> &src, - vec_t<__nv_fp8_e5m2, vec_size> &dst) { - if 
constexpr (vec_size == 1) { - dst.data = __nv_fp8_e4m3(src.data); - } else if constexpr (vec_size == 2) { - *(__nv_fp8x2_e5m2 *)(&dst.data) = __nv_fp8x2_e5m2(*(half2 *)(&src.data)); - } else { -#pragma unroll - for (size_t i = 0; i < vec_size / 4; ++i) { - // NOTE(Zihao): need to double check if we properly handle flo and fhi - ((__nv_fp8x4_e5m2 *)(&dst.data))[i] = __nv_fp8x4_e5m2( - ((half2 *)(&src.data))[i * 2], ((half2 *)(&src.data))[i * 2 + 1]); - } - } -} - -#endif // FLASHINFER_USE_FP8 - -#endif // VEC_DTYPES_CUH_ diff --git a/csrc/punica/punica_ops.cc b/csrc/punica/punica_ops.cc deleted file mode 100644 index 4ad46e5e1f7263766897c4a7ed75167842c9b0bf..0000000000000000000000000000000000000000 --- a/csrc/punica/punica_ops.cc +++ /dev/null @@ -1,563 +0,0 @@ -#include <cuda_bf16.h> -#include <cuda_fp16.h> -#include <torch/extension.h> - -#include <cstdint> - -#include "bgmv/bgmv_config.h" - -namespace { - -//====== utils ====== - -inline void check_shape(const torch::Tensor &a, const torch::Tensor &b, - const char *a_name, const char *b_name) { - TORCH_CHECK(a.dim() == b.dim(), a_name, ".dim() != ", b_name, ".dim(). ", - a.dim(), " vs ", b.dim()); - for (int i = 0; i < a.dim(); ++i) { - TORCH_CHECK(a.size(i) == b.size(i), a_name, ".size(", i, ") != ", b_name, - ".size(", i, ")"); - } -} - -inline constexpr uint32_t pack_u16(uint16_t a, uint16_t b) { - return (uint32_t(a) << 16) | uint32_t(b); -} - -#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") - -#define CHECK_CONTIGUOUS(x) \ - TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") - -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -#define CHECK_DIM(d, x) \ - TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor") - -#define CHECK_SHAPE(a, b) check_shape(a, b, #a, #b) - -#define CHECK_EQ(a, b) \ - TORCH_CHECK(a == b, "CHECK_EQ(" #a ", " #b ") failed. 
", a, " vs ", b) - -//====== bgmv ====== - -template <typename in_T, typename out_T, typename W_T> -inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W, - const int64_t *lora_indices, - uint16_t in_features, uint16_t out_features, - int64_t y_offset, int64_t full_y_size, - int64_t batch_size, int64_t num_layers, - int64_t layer_idx, float scale) { - switch (pack_u16(in_features, out_features)) { -#define CASE_ONESIDE(_in_T, _out_T, _W_T, feat_in, feat_out) \ - case pack_u16(feat_in, feat_out): \ - bgmv_kernel<feat_in, feat_out>(Y, X, W, lora_indices, y_offset, \ - full_y_size, batch_size, num_layers, \ - layer_idx, scale); \ - break; -#define CASE(_in_T, _out_T, _W_T, narrow, wide) \ - CASE_ONESIDE(in_T, out_T, W_T, narrow, wide) \ - CASE_ONESIDE(in_T, out_T, W_T, wide, narrow) - - FOR_BGMV_WIDE_NARROW(CASE, _, _, _) -#undef CASE -#undef CASE_ONESIDE - default: - return false; - } - - return true; -} - -void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w, - torch::Tensor indicies, int64_t layer_idx, float scale) { - CHECK_INPUT(y); - CHECK_INPUT(x); - CHECK_INPUT(w); - CHECK_INPUT(indicies); - - CHECK_DIM(2, y); - CHECK_DIM(2, x); - CHECK_DIM(4, w); - CHECK_DIM(1, indicies); - - int64_t B = x.size(0); - int64_t h_in = x.size(1); - int64_t h_out = y.size(1); - int64_t num_layers = w.size(1); - CHECK_EQ(w.size(3), h_in); - CHECK_EQ(w.size(2), h_out); - CHECK_EQ(indicies.size(0), x.size(0)); - CHECK_EQ(y.size(0), x.size(0)); - bool ok = false; - if (h_in < 65536 && h_out < 65536) { - // TODO: See if we can get rid of this massive nested switch - switch (x.scalar_type()) { - case at::ScalarType::Half: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()), - static_cast<nv_half *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()), - static_cast<nv_half *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()), - static_cast<nv_half *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()), - static_cast<nv_half *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<float *>(y.data_ptr()), - static_cast<nv_half *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<float *>(y.data_ptr()), - static_cast<nv_half *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - 
h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()), - static_cast<nv_bfloat16 *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()), - static_cast<nv_bfloat16 *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()), - static_cast<nv_bfloat16 *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()), - static_cast<nv_bfloat16 *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<float *>(y.data_ptr()), - static_cast<nv_bfloat16 *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<float *>(y.data_ptr()), - static_cast<nv_bfloat16 *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()), - static_cast<float *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()), - static_cast<float *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()), - static_cast<float *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()), - static_cast<float *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - case 
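
`launch_bgmv_kernel` above folds the `(in_features, out_features)` pair into one 32-bit switch key via `pack_u16`, so every shape instantiated by `FOR_BGMV_WIDE_NARROW` becomes a single `case` label; this is also why the callers only dispatch when `h_in < 65536 && h_out < 65536`. A small standalone sketch of the packing (`pack_u16_example` and the concrete dimensions are illustrative, not taken from the source):

```cpp
#include <cstdint>

// Same packing rule as pack_u16 above: first dimension in the high 16 bits,
// second in the low 16 bits. The dimensions used here are examples only.
constexpr uint32_t pack_u16_example(uint16_t a, uint16_t b) {
  return (uint32_t(a) << 16) | uint32_t(b);
}
static_assert(pack_u16_example(16, 4096) == 0x00101000u,
              "16 goes to the high half, 4096 to the low half");
static_assert(pack_u16_example(16, 4096) != pack_u16_example(4096, 16),
              "narrow/wide and wide/narrow shapes get distinct case labels");
```
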
at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<float *>(y.data_ptr()), - static_cast<float *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<float *>(y.data_ptr()), - static_cast<float *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, 0, - h_out, B, num_layers, layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - default: - break; - } - } - TORCH_CHECK(ok, "No suitable kernel.", " h_in=", h_in, " h_out=", h_out, - " dtype=", x.scalar_type(), " out_dtype=", y.scalar_type()); -} - -void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w, - torch::Tensor indicies, int64_t layer_idx, - float scale, int64_t h_in, int64_t h_out, - int64_t y_offset) { - CHECK_INPUT(y); - CHECK_INPUT(x); - CHECK_INPUT(w); - CHECK_INPUT(indicies); - - CHECK_DIM(2, y); - CHECK_DIM(2, x); - CHECK_DIM(4, w); - CHECK_DIM(1, indicies); - - int64_t B = x.size(0); - int64_t num_layers = w.size(1); - int64_t full_y_size = y.size(1); - CHECK_EQ(w.size(3), h_in); - CHECK_EQ(w.size(2), h_out); - CHECK_EQ(indicies.size(0), x.size(0)); - CHECK_EQ(y.size(0), x.size(0)); - bool ok = false; - if (h_in < 65536 && h_out < 65536) { - // TODO: See if we can get rid of this massive nested switch - switch (x.scalar_type()) { - case at::ScalarType::Half: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()), - static_cast<nv_half *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()), - static_cast<nv_half *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()), - static_cast<nv_half *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()), - static_cast<nv_half *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<float *>(y.data_ptr()), - static_cast<nv_half *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<float *>(y.data_ptr()), - static_cast<nv_half *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, 
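
For reference, the shape contract that the `CHECK_*` macros in `dispatch_bgmv_low_level` above enforce; the meaning of `w`'s leading dimension and of the written `y` slice is a reading of the surrounding code, not something the checks themselves state:

```cpp
// x        : [B, h_in]                     Half, BFloat16, or Float
// y        : [B, full_y_size]              presumably an h_out-wide slice starting at y_offset is updated
// w        : [?, num_layers, h_out, h_in]  4-D; the leading (adapter) dimension is never checked here
// indicies : [B]                           int64 index per batch element (spelling as in the source)
//
// Only h_in < 65536 and h_out < 65536 reach a kernel; anything else trips the
// final TORCH_CHECK with "No suitable kernel."
```
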
h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()), - static_cast<nv_bfloat16 *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()), - static_cast<nv_bfloat16 *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()), - static_cast<nv_bfloat16 *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()), - static_cast<nv_bfloat16 *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<float *>(y.data_ptr()), - static_cast<nv_bfloat16 *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<float *>(y.data_ptr()), - static_cast<nv_bfloat16 *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (y.scalar_type()) { - case at::ScalarType::Half: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()), - static_cast<float *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<nv_half *>(y.data_ptr()), - static_cast<float *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::BFloat16: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()), - static_cast<float *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<nv_bfloat16 *>(y.data_ptr()), - static_cast<float *>(x.data_ptr()), - static_cast<nv_bfloat16 
*>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - case at::ScalarType::Float: - switch (w.scalar_type()) { - case at::ScalarType::Half: - ok = launch_bgmv_kernel(static_cast<float *>(y.data_ptr()), - static_cast<float *>(x.data_ptr()), - static_cast<nv_half *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - case at::ScalarType::BFloat16: - ok = launch_bgmv_kernel(static_cast<float *>(y.data_ptr()), - static_cast<float *>(x.data_ptr()), - static_cast<nv_bfloat16 *>(w.data_ptr()), - indicies.data_ptr<int64_t>(), h_in, h_out, - y_offset, full_y_size, B, num_layers, - layer_idx, scale); - break; - default: - break; - } - break; - default: - break; - } - break; - default: - break; - } - } - TORCH_CHECK(ok, "No suitable kernel.", " h_in=", h_in, " h_out=", h_out, - " dtype=", x.scalar_type(), " out_dtype=", y.scalar_type()); -} - -} // namespace - -//====== pybind ====== - -#define DEFINE_pybind(name) m.def(#name, &name, #name); - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("dispatch_bgmv", &dispatch_bgmv, "dispatch_bgmv"); - m.def("dispatch_bgmv_low_level", &dispatch_bgmv_low_level, - "dispatch_bgmv_low_level"); -} diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp deleted file mode 100644 index 8a8235691ab8e8691ec099d424487d5e373debc1..0000000000000000000000000000000000000000 --- a/csrc/pybind.cpp +++ /dev/null @@ -1,115 +0,0 @@ -#include "cache.h" -#include "cuda_utils.h" -#include "ops.h" -#include <torch/extension.h> - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - // vLLM custom ops - pybind11::module ops = m.def_submodule("ops", "vLLM custom operators"); - - // Attention ops - ops.def( - "paged_attention_v1", - &paged_attention_v1, - "Compute the attention between an input query and the cached keys/values using PagedAttention."); - ops.def( - "paged_attention_v2", - &paged_attention_v2, - "PagedAttention V2."); - - // Activation ops - ops.def( - "silu_and_mul", - &silu_and_mul, - "Activation function used in SwiGLU."); - ops.def( - "gelu_new", - &gelu_new, - "GELU implementation used in GPT-2."); - ops.def( - "gelu_fast", - &gelu_fast, - "Approximate GELU implementation."); - - // Layernorm - ops.def( - "rms_norm", - &rms_norm, - "Apply Root Mean Square (RMS) Normalization to the input tensor."); - - ops.def( - "fused_add_rms_norm", - &fused_add_rms_norm, - "In-place fused Add and RMS Normalization"); - - // Rotary embedding - ops.def( - "rotary_embedding", - &rotary_embedding, - "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); - -#ifndef USE_ROCM - // Quantization ops - ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); - ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); -#endif - ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ"); - ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); - ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); - ops.def( - "moe_align_block_size", - &moe_align_block_size, - "Aligning the number of tokens to be processed by each expert such that it is divisible by the block size."); - - // Cache ops - pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); - cache_ops.def( - "swap_blocks", - &swap_blocks, - "Swap in (out) the cache blocks from src to dst"); - cache_ops.def( - "copy_blocks", - ©_blocks, - "Copy the cache blocks from src to 
dst"); - cache_ops.def( - "reshape_and_cache", - &reshape_and_cache, - "Reshape the key and value tensors and cache them"); - cache_ops.def( - "gather_cached_kv", - &gather_cached_kv, - "Gather key and value from the cache into contiguous QKV tensors"); - cache_ops.def( - "convert_fp8_e5m2", - &convert_fp8_e5m2, - "Convert the key and value cache to fp8_e5m2 data type"); - - // Cuda utils - pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils"); - cuda_utils.def( - "get_device_attribute", - &get_device_attribute, - "Gets the specified device attribute."); - - cuda_utils.def( - "get_max_shared_memory_per_block_device_attribute", - &get_max_shared_memory_per_block_device_attribute, - "Gets the maximum shared memory per block device attribute."); - -#ifndef USE_ROCM - // Custom all-reduce kernels - pybind11::module custom_ar = m.def_submodule("custom_ar", "custom allreduce"); - custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar"); - custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar"); - custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg"); - custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg"); - custom_ar.def("dispose", &dispose, "dispose"); - custom_ar.def("meta_size", &meta_size, "meta_size"); - custom_ar.def("register_buffer", ®ister_buffer, "register_buffer"); - custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, - "get_graph_buffer_ipc_meta"); - custom_ar.def("register_graph_buffers", ®ister_graph_buffers, - "register_graph_buffers"); -#endif - -} diff --git a/csrc/quantization/awq/dequantize.cuh b/csrc/quantization/awq/dequantize.cuh deleted file mode 100644 index d1d926de18d78a577a767c685f9ead9481269a49..0000000000000000000000000000000000000000 --- a/csrc/quantization/awq/dequantize.cuh +++ /dev/null @@ -1,87 +0,0 @@ -/* -Adapted from https://github.com/mit-han-lab/llm-awq -Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h -@article{lin2023awq, - title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, - author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, - journal={arXiv}, - year={2023} -} -*/ - -#pragma once - -namespace vllm { -namespace awq { - -__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) -{ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 - assert(false); -#else - uint4 result; - - uint32_t* h = reinterpret_cast<uint32_t*>(&result); - uint32_t const i4s = reinterpret_cast<uint32_t const&>(source); - - // First, we extract the i4s and construct an intermediate fp16 number. - static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; - static constexpr uint32_t BOTTOM_MASK = 0x000f000f; - static constexpr uint32_t TOP_MASK = 0x00f000f0; - static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; - - // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing - // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. - // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and - // elt_67 to fp16 without having to shift them to the bottom bits before hand. - - // Shift right by 8 to now consider elt_45 and elt_67. 
Issue first to hide RAW dependency if we issue - // immediately before required. - const uint32_t top_i4s = i4s >> 8; - // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[0]) - : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); - // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[1]) - : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); - // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[2]) - : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); - // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 - asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" - : "=r"(h[3]) - : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); - - // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the - // half2 ctor. In this case, I chose performance reliability over code readability. - - // This is the half2 {1032, 1032} represented as an integer. - // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; - // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] - static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; - // This is the half2 {1 / 16, 1 / 16} represented as an integer. - static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; - // This is the half2 {-72, -72} represented as an integer. - // static constexpr uint32_t NEG_72 = 0xd480d480; - // Haotian: Let's use {-64, -64}. - static constexpr uint32_t NEG_64 = 0xd400d400; - - // Finally, we construct the output numbers. - // Convert elt_01 - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); - // Convert elt_23 - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); - // Convert elt_45 - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); - // Convert elt_67 - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); - - return result; -#endif -} - -} // namespace awq -} // namespace vllm diff --git a/csrc/quantization/awq/gemm_kernels.cu b/csrc/quantization/awq/gemm_kernels.cu deleted file mode 100644 index 376c8ebfb9b7ae6368bd4d528e68072e5d120b33..0000000000000000000000000000000000000000 --- a/csrc/quantization/awq/gemm_kernels.cu +++ /dev/null @@ -1,668 +0,0 @@ -/* -Adapted from https://github.com/mit-han-lab/llm-awq -@article{lin2023awq, - title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, - author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, - journal={arXiv}, - year={2023} -} - */ - - -#include <torch/extension.h> -#include <c10/cuda/CUDAGuard.h> - -#include "dequantize.cuh" - -#include <cuda_fp16.h> - -namespace vllm { -namespace awq { - -// Pack two half values. 
-static inline __device__ __host__ unsigned -__pack_half2(const half x, const half y) { - unsigned v0 = *((unsigned short *)&x); - unsigned v1 = *((unsigned short *)&y); - return (v1 << 16) | v0; -} - -__global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n128k32(int G, int split_k_iters, half* __restrict__ A, int* __restrict__ B, half* __restrict__ scaling_factors, int* __restrict__ zeros, int M, int IC, int OC, half* __restrict__ C) -{ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 - assert(false); -#else - static constexpr uint32_t ZERO = 0x0; - float C_warp[32]; - __shared__ half A_shared[16 * (32 + 8)]; - __shared__ half B_shared[32 * (128 + 8)]; - - __shared__ half scaling_factors_shared[128]; - __shared__ half zeros_shared[128]; - - int j_factors1 = ((OC + 128 - 1) / 128); - int blockIdx_x = 0; - int blockIdx_y = blockIdx.x % ((M + 16 - 1) / 16 * j_factors1); - int blockIdx_z = blockIdx.x / ((M + 16 - 1) / 16 * j_factors1); - - half A_shared_warp[8]; - half B_shared_warp[32]; - for (int j_0_4_init = 0; j_0_4_init < 4; ++j_0_4_init) { - for (int i = 0; i < 8; ++i) { - C_warp[(j_0_4_init * 8) + i] = 0.0; - } - } - - static constexpr int row_stride_warp = 32 * 8 / 32; - static constexpr int row_stride = 2 * 32 * 8 / 128; - bool ld_zero_flag = (threadIdx.y * 32 + threadIdx.x) * 8 < 128; - // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 - bool ld_A_flag = (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp + threadIdx.x * 8 / 32) < M; // threadIdx.y is warp_id - // bool wb_C_flag = (threadIdx.x / 4) < M; - - half* A_ptr = A - + (((int)blockIdx_y) / j_factors1 * 16 + (((int)threadIdx.y) * row_stride_warp) + ((int)threadIdx.x) / (32 / 8)) * IC - + (((int)threadIdx.x) % (32 / 8)) * 8; - - int* B_ptr = B - + ((int)threadIdx.y) * (OC / 8) * 2 - + (((int)threadIdx.x) / (128 / 8)) * (OC / 8) - + (((int)blockIdx_y) % j_factors1) * (128 / 8) - + (((int)threadIdx.x) % (128 / 8)) * 1; -// Why * 1 in the above line? - - half* A_shared_ptr = A_shared - + ((int)threadIdx.y) * row_stride_warp * (32 + 8) - + (((int)threadIdx.x) / (32 / 8)) * (32 + 8) - + (((int)threadIdx.x) % (32 / 8) ) * 8; - - half* B_shared_ptr = B_shared - + ((int)threadIdx.y) * (row_stride / 2) * (128 + 8) - + (((int)threadIdx.x) / (128 / 8)) * (128 + 8) - + (((int)threadIdx.x) % (128 / 8)) * 8; - - int* zeros_ptr = zeros - + (((int)blockIdx_y) % j_factors1) * (128 / 8) - + ((int)threadIdx.x) % (128 / 8); - - half* scaling_factors_ptr = scaling_factors - + (((int)blockIdx_y) % j_factors1) * (128) - + (((int)threadIdx.x) % (128 / 8)) * 8; - - half* C_ptr = C - + static_cast<long long>(blockIdx_z) * M * OC // blockIdz.x -> split_k dim - + (((int)blockIdx_y) % j_factors1) * 128 - + ((int)threadIdx.y) * 64 - + (((int)threadIdx.x) % 4) * 2; - - // preload s.f. 
and zeros - int k_bound = (IC / 32 + split_k_iters - 1) / split_k_iters; - if ((k_bound - 1) * split_k_iters * 32 + blockIdx_z * 32 >= IC) k_bound -= 1; - for (int _k_0_0 = 0; _k_0_0 < k_bound; ++_k_0_0) { - int k_0_0 = _k_0_0 * split_k_iters + blockIdx_z; - __syncthreads(); - // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 - if (ld_A_flag) - { - *(uint4*)(A_shared_ptr) = *(uint4*)(A_ptr + (k_0_0 * 32)); - } - else - { - *(uint4*)(A_shared_ptr) = make_uint4(0, 0, 0, 0); - } - - // for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 2; ++ax0_ax1_fused_0) { - uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr + k_0_0 * 32 / G * (OC / 8)); - uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded); - uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr + k_0_0 * 32 / G * (OC)); - /* - if (blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 0 && threadIdx.y == 0){ - printf("%x %x %x %x %x %x %x %x\n", B_loaded_scale.x, B_loaded_scale.y, B_loaded_scale.z, B_loaded_scale.w, B_loaded_zero.x, B_loaded_zero.y, B_loaded_zero.z, B_loaded_zero.w); - } - */ - // uint4 B_loaded_scale = make_uint4(0, 0, 0, 0); - int* B_ptr_local = B_ptr + k_0_0 * 32 * (OC / 8); - - for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 8; ++ax0_ax1_fused_0) { - - // B: 32 x 136 (128+8) float16 - // each warp: 32 x 4 - // each thr: read 32 bit -> convert to 8xFP16 (a UINT4) -> scale and minus zero -> WB UINT4 - // *(uint4*)(B_shared + ((((ax0_ax1_fused_0 * 544) + (((int)threadIdx.y) * 272)) + ((((int)threadIdx.x) >> 4) * 136)) + ((((int)threadIdx.x) & 15) * 8))) = *(uint4*)(B + ((((((k_0_0 * 163840) + (ax0_ax1_fused_0 * 20480)) + (((int)threadIdx.y) * 10240)) + ((((int)threadIdx.x) >> 4) * 5120)) + (((int)blockIdx_y) * 128)) + ((((int)threadIdx.x) & 15) * 8))); - // row stride in shared memory: (NWARPS * 32 * 8 / cta_N) - uint32_t B_loaded = *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8)); - uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); - //uint4 B_loaded_zero = *(uint4*)(zeros_shared + (threadIdx.x % (cta_N / 8)) * 8); - - // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); - // - zero and * scale - // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. 
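// The four sub/fma pairs below apply (q - zero) * scale to each half2 lane of the
// uint4 that dequantize_s4_to_fp16x2 expanded from 8 packed 4-bit weights, using the
// zero point and scale fetched above for the current 32-row K chunk. A minimal scalar
// sketch of one lane (hypothetical helper, not part of the original kernel):
//
//   __device__ __forceinline__ half2 dequant_lane(half2 q, half2 zero, half2 scale) {
//       // sub.f16x2 followed by fma.rn.f16x2 with a zero addend, as in the asm below
//       return __hfma2(__hsub2(q, zero), scale, __float2half2_rn(0.f));
//   }
//
// As the TODO above notes, precomputing zero * scale per group and formulating the
// dequant as q * scale - zero * scale would collapse each sub+fma pair into one fma.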
- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); - /* - if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ - printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); - } - */ - - // write back - *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (128 + 8)) = B_loaded_fp16; - } - __syncthreads(); - - for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1) { - { - unsigned int addr; - __asm__ __volatile__( - "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" - : "=r"(addr) - : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) - ); - - - __asm__ __volatile__( - "ldmatrix.sync.aligned.m8n8.x4.shared.b16" - "{%0, %1, %2, %3}, [%4];\n" - : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3]) - : "r"(addr) - ); - } - - for (int ax1_0 = 0; ax1_0 < 4; ++ax1_0) { - { - unsigned int addr; - __asm__ __volatile__( - "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" - : "=r"(addr) - : "l"((void *)((&(B_shared[(((k_0_1 * 2176) + (((int)threadIdx.y) * 64)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 136) + ((((int)threadIdx.x) >> 4) * 8)))) - ); - __asm__ __volatile__( - "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" - "{%0, %1, %2, %3}, [%4];\n" - : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) - : "r"(addr) - ); - } - } - for (int j_0_4 = 0; j_0_4 < 4; ++j_0_4) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); - } - - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float 
*)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); - } - - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); - } - - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); - } -#else - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" - : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); - } - - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" - : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); - } - -#endif - } - } - } - 
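// At this point each thread of the warp holds 32 fp32 partial sums in C_warp
// (4 output sub-tiles x 8 accumulator values from the mma fragments above). The
// write-back loop below converts them to fp16 and scatters them into C, guarding
// each store with row_offset < M since the CTA tile may overhang the batch. C_ptr
// was offset by blockIdx_z * M * OC, so each split-k slice writes a private copy of
// the output; the host wrapper reduces the slices afterwards (awq_gemm returns
// _out_feats.sum(0)).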
-// TODO: Shang: Hoist loop invariance. - for (int ax1_0_1 = 0; ax1_0_1 < 4; ++ax1_0_1) { - for (int local_id = 0; local_id < 8; ++local_id) { - int row_offset = (((int)blockIdx_y) / j_factors1) * 16 + ((int)threadIdx.x) / 4 + (local_id % 4) / 2 * 8; - if (row_offset < M) - { - *(C_ptr + ax1_0_1 * 16 + row_offset * OC + (local_id / 4) * 8 + local_id % 2) = __float2half(C_warp[(ax1_0_1 * 8) + local_id]); - } - } - } -#endif -} - - -__global__ void __launch_bounds__(64) gemm_forward_4bit_cuda_m16n64k32(int G, int split_k_iters, half* __restrict__ A, int* __restrict__ B, half* __restrict__ scaling_factors, int* __restrict__ zeros, int M, int IC, int OC, half* __restrict__ C) -{ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 - assert(false); -#else - static constexpr uint32_t ZERO = 0x0; - float C_warp[32]; - __shared__ half A_shared[16 * (32 + 8)]; - __shared__ half B_shared[32 * (64 + 8)]; - - __shared__ half scaling_factors_shared[64]; - __shared__ half zeros_shared[64]; - - int j_factors1 = ((OC + 64 - 1) / 64); - - int blockIdx_x = 0; - int blockIdx_y = blockIdx.x % ((M + 16 - 1) / 16 * j_factors1); - int blockIdx_z = blockIdx.x / ((M + 16 - 1) / 16 * j_factors1); - - half A_shared_warp[8]; - half B_shared_warp[16]; - for (int j_0_4_init = 0; j_0_4_init < 2; ++j_0_4_init) { - for (int i = 0; i < 8; ++i) { - C_warp[(j_0_4_init * 8) + i] = 0.0; - } - } - - static constexpr int row_stride_warp = 32 * 8 / 32; - static constexpr int row_stride = 2 * 32 * 8 / 64; - bool ld_zero_flag = (threadIdx.y * 32 + threadIdx.x) * 8 < 64; - // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 - bool ld_A_flag = (blockIdx_y / j_factors1 * 16 + threadIdx.y * row_stride_warp + threadIdx.x * 8 / 32) < M; // threadIdx.y is warp_id - // bool wb_C_flag = (threadIdx.x / 4) < M; - - half* A_ptr = A - + (((int)blockIdx_y) / j_factors1 * 16 + (((int)threadIdx.y) * row_stride_warp) + ((int)threadIdx.x) / (32 / 8)) * IC - + (((int)threadIdx.x) % (32 / 8)) * 8; - - int* B_ptr = B - + ((int)threadIdx.y) * (OC / 8) * 4 - + (((int)threadIdx.x) / (64 / 8)) * (OC / 8) - + (((int)blockIdx_y) % j_factors1) * (64 / 8) - + (((int)threadIdx.x) % (64 / 8)) * 1; -// Why * 1 in the above line? - - half* A_shared_ptr = A_shared - + ((int)threadIdx.y) * row_stride_warp * (32 + 8) - + (((int)threadIdx.x) / (32 / 8)) * (32 + 8) - + (((int)threadIdx.x) % (32 / 8) ) * 8; - - half* B_shared_ptr = B_shared - + ((int)threadIdx.y) * (row_stride / 2) * (64 + 8) - + (((int)threadIdx.x) / (64 / 8)) * (64 + 8) - + (((int)threadIdx.x) % (64 / 8)) * 8; - - int* zeros_ptr = zeros - + (((int)blockIdx_y) % j_factors1) * (64 / 8) - + ((int)threadIdx.x) % (64 / 8); - - half* scaling_factors_ptr = scaling_factors - + (((int)blockIdx_y) % j_factors1) * (64) - + (((int)threadIdx.x) % (64 / 8)) * 8; - - half* C_ptr = C - + static_cast<long long>(blockIdx_z) * M * OC // blockIdz.x -> split_k dim - + (((int)blockIdx_y) % j_factors1) * 64 - + ((int)threadIdx.y) * 32 - + (((int)threadIdx.x) % 4) * 2; - - // preload s.f. 
and zeros - int k_bound = (IC / 32 + split_k_iters - 1) / split_k_iters; - if ((k_bound - 1) * split_k_iters * 32 + blockIdx_z * 32 >= IC) k_bound -= 1; - for (int _k_0_0 = 0; _k_0_0 < k_bound; ++_k_0_0) { - int k_0_0 = _k_0_0 * split_k_iters + blockIdx_z; - __syncthreads(); - // TODO: Haotian: blockIdx_y / j_factors1 in A loading to support bsz > 16 - if (ld_A_flag) - { - *(uint4*)(A_shared_ptr) = *(uint4*)(A_ptr + (k_0_0 * 32)); - } - else - { - *(uint4*)(A_shared_ptr) = make_uint4(0, 0, 0, 0); - } - - // for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 2; ++ax0_ax1_fused_0) { - uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr + k_0_0 * 32 / G * (OC / 8)); - uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded); - uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr + k_0_0 * 32 / G * (OC)); - /* - if (blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 0 && threadIdx.y == 0){ - printf("%x %x %x %x %x %x %x %x\n", B_loaded_scale.x, B_loaded_scale.y, B_loaded_scale.z, B_loaded_scale.w, B_loaded_zero.x, B_loaded_zero.y, B_loaded_zero.z, B_loaded_zero.w); - } - */ - // uint4 B_loaded_scale = make_uint4(0, 0, 0, 0); - int* B_ptr_local = B_ptr + k_0_0 * 32 * (OC / 8); - - for (int ax0_ax1_fused_0 = 0; ax0_ax1_fused_0 < 4; ++ax0_ax1_fused_0) { - - // B: 32 x 136 (128+8) float16 - // each warp: 32 x 4 - // each thr: read 32 bit -> convert to 8xFP16 (a UINT4) -> scale and minus zero -> WB UINT4 - // *(uint4*)(B_shared + ((((ax0_ax1_fused_0 * 544) + (((int)threadIdx.y) * 272)) + ((((int)threadIdx.x) >> 4) * 136)) + ((((int)threadIdx.x) & 15) * 8))) = *(uint4*)(B + ((((((k_0_0 * 163840) + (ax0_ax1_fused_0 * 20480)) + (((int)threadIdx.y) * 10240)) + ((((int)threadIdx.x) >> 4) * 5120)) + (((int)blockIdx_y) * 128)) + ((((int)threadIdx.x) & 15) * 8))); - // row stride in shared memory: (NWARPS * 32 * 8 / cta_N) - uint32_t B_loaded = *(uint32_t*)(B_ptr_local + ax0_ax1_fused_0 * row_stride * (OC / 8)); - uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); - //uint4 B_loaded_zero = *(uint4*)(zeros_shared + (threadIdx.x % (cta_N / 8)) * 8); - - // uint4 B_loaded_scale = *(uint4*)(scaling_factors_shared + (threadIdx.x % (cta_N / 8)) * 8); - // - zero and * scale - // TODO (Haotian): can save 4 assembly instructions if sormulate as deq = q * scale - zero * scale. 
- asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); - /* - if (ax0_ax1_fused_0 == 0 && blockIdx_z == 0 && blockIdx_y == 0 && k_0_0 == 0 && threadIdx.x == 17 && threadIdx.y == 0){ - printf("[x] %X %X %X %X\n", B_loaded_fp16.x, B_loaded_fp16.y, B_loaded_fp16.z, B_loaded_fp16.w); - } - */ - - // write back - *(uint4*)(B_shared_ptr + ax0_ax1_fused_0 * row_stride * (64 + 8)) = B_loaded_fp16; - } - __syncthreads(); - - for (int k_0_1 = 0; k_0_1 < 2; ++k_0_1) - { - { - unsigned int addr; - __asm__ __volatile__( - "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" - : "=r"(addr) - : "l"((void *)((&(A_shared[(k_0_1 * 16)])) + (((((int)threadIdx.x) & 15) * 40) + ((((int)threadIdx.x) >> 4) * 8)))) - ); - __asm__ __volatile__( - "ldmatrix.sync.aligned.m8n8.x4.shared.b16" - "{%0, %1, %2, %3}, [%4];\n" - : "=r"(((unsigned *)(A_shared_warp + 0))[0]), "=r"(((unsigned *)(A_shared_warp + 0))[1]), "=r"(((unsigned *)(A_shared_warp + 0))[2]), "=r"(((unsigned *)(A_shared_warp + 0))[3]) - : "r"(addr) - ); - } - - - for (int ax1_0 = 0; ax1_0 < 2; ++ax1_0) - { - { - unsigned int addr; - __asm__ __volatile__( - "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" - : "=r"(addr) - : "l"((void *)((&(B_shared[(((k_0_1 * 1152) + (((int)threadIdx.y) * 32)) + (ax1_0 * 16))])) + (((((int)threadIdx.x) & 15) * 72) + ((((int)threadIdx.x) >> 4) * 8)))) - ); - __asm__ __volatile__( - "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16" - "{%0, %1, %2, %3}, [%4];\n" - : "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[0]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[1]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[2]), "=r"(((unsigned *)(B_shared_warp + (ax1_0 * 8)))[3]) - : "r"(addr) - ); - } - } - - for (int j_0_4 = 0; j_0_4 < 2; ++j_0_4) - { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 750 - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); - } - - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : 
"=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); - } - - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); - } - - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" - : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); - } -#else - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" - : "=f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "=f"(((float *)(C_warp + (j_0_4 * 8)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[0]), "r"(((unsigned *)(B_shared_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[0]), "f"(((float *)(C_warp + (j_0_4 * 8)))[1]), "f"(((float *)(C_warp + (j_0_4 * 8)))[2]), "f"(((float *)(C_warp + (j_0_4 * 8)))[3])); - } - - { - __asm__ __volatile__( - "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" - "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" - : "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "=f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3]) - : "r"(((unsigned *)(A_shared_warp + 0))[0]), "r"(((unsigned *)(A_shared_warp + 0))[1]), "r"(((unsigned *)(A_shared_warp + 0))[2]), "r"(((unsigned *)(A_shared_warp + 0))[3]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[0]), "r"(((unsigned *)(B_shared_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[0]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[1]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[2]), "f"(((float *)(C_warp + ((j_0_4 * 8) + 4)))[3])); - } -#endif - } 
- } - } - -// TODO: Shang: Hoist loop invariance. - for (int ax1_0_1 = 0; ax1_0_1 < 2; ++ax1_0_1) { - for (int local_id = 0; local_id < 8; ++local_id) { - int row_offset = (((int)blockIdx_y) / j_factors1) * 16 + ((int)threadIdx.x) / 4 + (local_id % 4) / 2 * 8; - if (row_offset < M) - { - *(C_ptr + ax1_0_1 * 16 + row_offset * OC + (local_id / 4) * 8 + local_id % 2) = __float2half(C_warp[(ax1_0_1 * 8) + local_id]); - } - } - } -#endif -} - -__global__ void __launch_bounds__(64) dequantize_weights( - int* __restrict__ B, - half* __restrict__ scaling_factors, - int* __restrict__ zeros, - half* __restrict__ C, - int G -) -{ - int j_factors1 = 4; - int row_stride2 = 4; - int split_k_iters = 1; - static constexpr uint32_t ZERO = 0x0; - half B_shared[32 * (128 + 8)]; - - half* B_shared_ptr2 = B_shared; - - half B_shared_warp[32]; - int OC = 512; - - int N = blockDim.x * gridDim.x; // 2 - int col = (blockIdx.x * blockDim.x + threadIdx.x); - int row = blockIdx.y * blockDim.y + threadIdx.y; - int index1 = 8 * col + 8 * row * N; - half* C_ptr2 = C + index1; - - int index2 = col + row * N; - int* B_ptr2 = B + index2; - - int index3 = col + (int)(row / G) * N; - int* zeros_ptr2 = zeros + index3; - int index4 = 8 * col + (int)(row / G) * N * 8; - half* scaling_factors_ptr2 = scaling_factors + index4; - - - uint32_t zeros_loaded = *(uint32_t*)(zeros_ptr2); - uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded); - uint4 B_loaded_scale = *(uint4*)(scaling_factors_ptr2); -int j=0; - - uint32_t B_loaded = *(uint32_t*)(B_ptr2 + j); - uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.x) : "r"(B_loaded_fp16.x), "r"(B_loaded_scale.x), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_zero.y)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.y) : "r"(B_loaded_fp16.y), "r"(B_loaded_scale.y), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_zero.z)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.z) : "r"(B_loaded_fp16.z), "r"(B_loaded_scale.z), "r"(ZERO)); - asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_zero.w)); - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(B_loaded_fp16.w) : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO)); - - *(uint4*)(B_shared_ptr2 + j) = B_loaded_fp16; - - for (int i=0; i<8; ++i) { - *(C_ptr2 + i) = B_shared[i]; - } -} - -} // namespace awq -} // namespace vllm - -torch::Tensor awq_dequantize( - torch::Tensor _kernel, - torch::Tensor _scaling_factors, - torch::Tensor _zeros, - int split_k_iters, - int thx, - int thy) -{ - int in_c = _kernel.size(0); - int qout_c = _kernel.size(1); - int out_c = qout_c * 8; - int G = in_c / _scaling_factors.size(0); - - int x_thread = thx; - int y_thread = thy; - - int x_blocks = 1; - int y_blocks = 1; - if (thx==0) { - x_thread = qout_c; - } - if (thy==0) { - y_thread = in_c; - } - if (thx==0 && thy==0) { - x_thread = 8; - y_thread = 8; - x_blocks = (int)(qout_c / 8); - y_blocks = (int)(in_c / 8); - } - - const at::cuda::OptionalCUDAGuard device_guard(device_of(_scaling_factors)); - - auto options = torch::TensorOptions().dtype(_scaling_factors.dtype()).device(_scaling_factors.device()); - at::Tensor _de_kernel = 
torch::empty({in_c, out_c}, options); - - auto kernel = reinterpret_cast<int*>(_kernel.data_ptr<int>()); - auto de_kernel = reinterpret_cast<half*>(_de_kernel.data_ptr<at::Half>()); - auto scaling_factors = reinterpret_cast<half*>(_scaling_factors.data_ptr<at::Half>()); - auto zeros = reinterpret_cast<int*>(_zeros.data_ptr<int>()); - - dim3 num_blocks(x_blocks, y_blocks); - dim3 threads_per_block(x_thread, y_thread); - - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - vllm::awq::dequantize_weights<<<num_blocks, threads_per_block, 0, stream>>>( - kernel, scaling_factors, zeros, de_kernel, G); - - return _de_kernel; -} - -// in_feats: M, IC [float16] -// kernel: IC, OC // 8 [int32] -> cast to IC, OC [uint4b] -// scaling_factors: IC // G, OC [float16] -// zeros: IC // G, OC // 8 [int32] -> cast to IC // G, OC [uint4b] -// assume that batch_size < 16 for now - -torch::Tensor awq_gemm( - torch::Tensor _in_feats, - torch::Tensor _kernel, - torch::Tensor _scaling_factors, - torch::Tensor _zeros, - int split_k_iters) -{ - int num_in_feats = _in_feats.size(0); - int num_in_channels = _in_feats.size(1); - const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats)); - - auto options = torch::TensorOptions().dtype(_in_feats.dtype()).device(_in_feats.device()); - at::Tensor _out_feats = torch::empty({split_k_iters, num_in_feats, _kernel.size(1) * 8}, options); - int num_out_feats = _out_feats.size(-2); - int num_out_channels = _out_feats.size(-1); - - auto in_feats = reinterpret_cast<half*>(_in_feats.data_ptr<at::Half>()); - auto kernel = reinterpret_cast<int*>(_kernel.data_ptr<int>()); - auto out_feats = reinterpret_cast<half*>(_out_feats.data_ptr<at::Half>()); - auto scaling_factors = reinterpret_cast<half*>(_scaling_factors.data_ptr<at::Half>()); - auto zeros = reinterpret_cast<int*>(_zeros.data_ptr<int>()); - int group_size = num_in_channels / _scaling_factors.size(0); - - if (num_out_channels % 64 != 0) - throw std::invalid_argument("OC is not multiple of cta_N = 64"); - if (num_out_channels % 8 != 0) - throw std::invalid_argument("OC is not multiple of pack_num = 8"); - if (group_size % 32 != 0) - throw std::invalid_argument("Group size should be a multiple of 32"); - if (num_out_channels % group_size != 0) - throw std::invalid_argument("OC is not multiple of Group size"); - - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - if (num_out_channels % 128 == 0) - { - int j_factors1 = num_out_channels / 128 / 1; - dim3 num_blocks((num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters); - // threadIdx.x: 32 - // threadIdx.y: i_factors[2] * j_factors[2] - dim3 threads_per_block(32, 2); - vllm::awq::gemm_forward_4bit_cuda_m16n128k32<<<num_blocks, threads_per_block, 0, stream>>>( - group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, num_out_channels, out_feats); - } - else if (num_out_channels % 64 == 0) - { - int j_factors1 = num_out_channels / 64 / 1; - dim3 num_blocks(1 * (num_out_feats + 16 - 1) / 16 * j_factors1 * split_k_iters); - - // threadIdx.x: 32 - // threadIdx.y: i_factors[2] * j_factors[2] - dim3 threads_per_block(32, 2); - vllm::awq::gemm_forward_4bit_cuda_m16n64k32<<<num_blocks, threads_per_block, 0, stream>>>( - group_size, split_k_iters, in_feats, kernel, scaling_factors, zeros, num_in_feats, num_in_channels, num_out_channels, out_feats); - } - return _out_feats.sum(0); -} diff --git a/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh b/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh deleted 
file mode 100644 index c3b0d311b89ccd9ab6eb9bce0b9d9e6295a919e0..0000000000000000000000000000000000000000 --- a/csrc/quantization/fp8_e5m2_kvcache/quant_utils.cuh +++ /dev/null @@ -1,278 +0,0 @@ -#pragma once - -#include <assert.h> -#include <stdint.h> -#include <float.h> -#include <type_traits> -#include "../../attention/attention_dtypes.h" -#include "../../attention/dtype_float32.cuh" -#include "../../attention/dtype_float16.cuh" -#include "../../attention/dtype_bfloat16.cuh" - -#pragma once - -namespace vllm { -#ifdef ENABLE_FP8_E5M2 -namespace fp8_e5m2_unscaled { - -template<typename Tout, typename Tin> -__inline__ __device__ Tout vec_conversion(const Tin& x) -{ - return x; -} - -// fp8 -> half -template<> -__inline__ __device__ uint16_t vec_conversion<uint16_t, uint8_t>(const uint8_t& a) -{ - __half_raw res = __nv_cvt_fp8_to_halfraw(a, __NV_E5M2); - return res.x; -} - -// fp8x2 -> half2 -template<> -__inline__ __device__ uint32_t vec_conversion<uint32_t, uint16_t>(const uint16_t& a) -{ - union { - uint16_t u16[2]; - uint32_t u32; - } tmp; - __half2_raw res = __nv_cvt_fp8x2_to_halfraw2(a, __NV_E5M2); - tmp.u16[0] = res.x; - tmp.u16[1] = res.y; - return tmp.u32; -} - -// fp8x4 -> half2x2 -template<> -__inline__ __device__ uint2 vec_conversion<uint2, uint32_t>(const uint32_t& a) -{ - union { - uint2 u32x2; - uint32_t u32[2]; - } tmp; - tmp.u32[0] = vec_conversion<uint32_t, uint16_t>((uint16_t)a); - tmp.u32[1] = vec_conversion<uint32_t, uint16_t>((uint16_t)(a >> 16U)); - return tmp.u32x2; -} - -// fp8x8 -> half2x4 -template<> -__inline__ __device__ uint4 vec_conversion<uint4, uint2>(const uint2& a) -{ - union { - uint4 u64x2; - uint2 u64[2]; - } tmp; - tmp.u64[0] = vec_conversion<uint2, uint32_t>(a.x); - tmp.u64[1] = vec_conversion<uint2, uint32_t>(a.y); - return tmp.u64x2; -} - -// fp8 -> __nv_bfloat16 -template<> -__inline__ __device__ __nv_bfloat16 vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a) -{ - // Note there is no direct convert function from fp8 to bf16. 
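// With no direct fp8 -> bf16 intrinsic, the conversion below goes through fp16 and
// fp32 instead (__nv_cvt_fp8_to_halfraw, then half_to_float, then __float2bfloat16);
// the packed specializations that follow (fp8x2 -> __nv_bfloat162, fp8x4 -> bf16_4_t,
// fp8x8 -> bf16_8_t) build on this scalar path by splitting the packed input.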
- // fp8 -> half - __half_raw res = __nv_cvt_fp8_to_halfraw(a, __NV_E5M2); - // half -> float -> bf16 - float tmp = half_to_float(res.x); - return __float2bfloat16(tmp); -} - -// fp8x2 -> __nv_bfloat162 -template<> -__inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a) -{ - __nv_bfloat162 res; - res.x = vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a); - res.y = vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)(a >> 8U)); - return res; -} - -// fp8x4 -> bf16_4_t -template<> -__inline__ __device__ bf16_4_t vec_conversion<bf16_4_t, uint32_t>(const uint32_t& a) -{ - bf16_4_t res; - res.x = vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a); - res.y = vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U)); - return res; -} - -// fp8x8 -> bf16_8_t -template<> -__inline__ __device__ bf16_8_t vec_conversion<bf16_8_t, uint2>(const uint2& a) -{ - bf16_4_t tmp1, tmp2; - tmp1 = vec_conversion<bf16_4_t, uint32_t>(a.x); - tmp2 = vec_conversion<bf16_4_t, uint32_t>(a.y); - bf16_8_t res; - res.x = tmp1.x; - res.y = tmp1.y; - res.z = tmp2.x; - res.w = tmp2.y; - return res; -} - -// fp8 -> float -template<> -__inline__ __device__ float vec_conversion<float, uint8_t>(const uint8_t& a) -{ - // fp8 -> half - uint16_t tmp = vec_conversion<uint16_t, uint8_t>(a); - // half -> float - return half_to_float(tmp); -} - -// fp8x2 -> float2 -template<> -__inline__ __device__ float2 vec_conversion<float2, uint16_t>(const uint16_t& a) -{ - // fp8x2 -> half2 - uint32_t tmp = vec_conversion<uint32_t, uint16_t>(a); - // half2 -> float2 - return half2_to_float2(tmp); -} - -// fp8x4 -> float4 -template<> -__inline__ __device__ Float4_ vec_conversion<Float4_, uint32_t>(const uint32_t& a) -{ - Float4_ res; - res.x = vec_conversion<float2, uint16_t>((uint16_t)a); - res.y = vec_conversion<float2, uint16_t>((uint16_t)(a >> 16U)); - return res; -} - -// fp8x8 -> float8 -template<> -__inline__ __device__ Float8_ vec_conversion<Float8_, uint2>(const uint2& a) -{ - Float4_ tmp1, tmp2; - tmp1 = vec_conversion<Float4_, uint32_t>(a.x); - tmp2 = vec_conversion<Float4_, uint32_t>(a.y); - Float8_ res; - res.x = tmp1.x; - res.y = tmp1.y; - res.z = tmp2.x; - res.w = tmp2.y; - return res; -} - - -// half -> fp8 -template<> -__inline__ __device__ uint8_t vec_conversion<uint8_t, uint16_t>(const uint16_t& a) -{ - __half_raw tmp; - tmp.x = a; - __nv_fp8_storage_t res = __nv_cvt_halfraw_to_fp8(tmp, __NV_SATFINITE, __NV_E5M2); - return (uint8_t)res; -} - -// bf16 -> fp8 -template<> -__inline__ __device__ uint8_t vec_conversion<uint8_t, __nv_bfloat16>(const __nv_bfloat16& a) -{ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 - assert(false); -#else - __nv_fp8_storage_t res = __nv_cvt_bfloat16raw_to_fp8(__nv_bfloat16_raw(a), __NV_SATFINITE, __NV_E5M2); - return (uint8_t)res; -#endif -} - -// float -> fp8 -template<> -__inline__ __device__ uint8_t vec_conversion<uint8_t, float>(const float& a) -{ - __nv_fp8_storage_t res = __nv_cvt_float_to_fp8(a, __NV_SATFINITE, __NV_E5M2); - return (uint8_t)res; -} - -// fp8x4 -> float4 -template<> -__inline__ __device__ float4 vec_conversion<float4, uint32_t>(const uint32_t& a) -{ - Float4_ tmp = vec_conversion<Float4_, uint32_t>(a); - float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y); - return res; -} - - -template<> -__inline__ __device__ uint32_t vec_conversion<uint32_t, float2>(const float2& a) -{ - union { - half2 float16; - uint32_t uint32; - }; - - float16 = __float22half2_rn(a); - return uint32; -} - -template<> -__inline__ __device__ uint2 
vec_conversion<uint2, Float4_>(const Float4_& a) -{ - uint2 b; - float2 val; - val.x = a.x.x; - val.y = a.x.y; - b.x = vec_conversion<uint32_t, float2>(val); - - val.x = a.y.x; - val.y = a.y.y; - b.y = vec_conversion<uint32_t, float2>(val); - - return b; -} - -template<> -__inline__ __device__ float4 vec_conversion<float4, Float4_>(const Float4_& a) -{ - float4 b; - b.x = a.x.x; - b.y = a.x.y; - b.z = a.y.x; - b.w = a.y.y; - return b; -} - -template<> -__inline__ __device__ uint4 vec_conversion<uint4, Float8_>(const Float8_& a) -{ - uint4 b; - b.x = vec_conversion<uint32_t, float2>(a.x); - b.y = vec_conversion<uint32_t, float2>(a.y); - b.z = vec_conversion<uint32_t, float2>(a.z); - b.w = vec_conversion<uint32_t, float2>(a.w); - return b; -} - -template<> -__inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, float2>(const float2 &a) { - __nv_bfloat162 b; - from_float(b, a); - return b; -} - -template<> -__inline__ __device__ bf16_4_t vec_conversion<bf16_4_t, Float4_>(const Float4_ &a) { - bf16_4_t b; - from_float(b, a); - return b; -} - -template<> -__inline__ __device__ bf16_8_t vec_conversion<bf16_8_t, Float8_>(const Float8_ &a) { - bf16_8_t b; - from_float(b, a); - return b; -} - -} // namespace fp8_e5m2_unscaled -#endif // ENABLE_FP8_E5M2 -} // namespace vllm diff --git a/csrc/quantization/gptq/compat.cuh b/csrc/quantization/gptq/compat.cuh deleted file mode 100644 index 4da0bc6e2df384200258117023b80abbea0b0b5d..0000000000000000000000000000000000000000 --- a/csrc/quantization/gptq/compat.cuh +++ /dev/null @@ -1,64 +0,0 @@ -/* -Copied from https://github.com/turboderp/exllamav2 -*/ - -#ifndef _compat_cuh -#define _compat_cuh - -namespace vllm { -namespace gptq { -// atomicAdd for half types, to support CC < 7.x - -__device__ __forceinline__ void atomicAdd_half(half* address, half val) -{ - unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); - unsigned int old = *address_as_ui; - unsigned int assumed; - - do - { - assumed = old; - __half_raw hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - half tmpres = __hadd(hsum, val); - hsum = __half_raw(tmpres); - old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; - old = atomicCAS(address_as_ui, assumed, old); - } - while (assumed != old); -} - -// atomicAdd for half2 types - -__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) -{ - unsigned int* address_as_ui = (unsigned int*)address; - unsigned int old = *address_as_ui; - unsigned int assumed; - do - { - assumed = old; - half2 old_val = *((half2*)&old); - half2 new_val = __hadd2(old_val, val); - old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); - } - while (assumed != old); -} - -// - -#if defined(__CUDA_ARCH__) || defined(USE_ROCM) -#if __CUDA_ARCH__ < 700 || defined(USE_ROCM) - -__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } - -#if __CUDA_ARCH__ < 600 || defined(USE_ROCM) -__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } -#endif - -#endif -#endif - -} // namespace gptq -} // namespace vllm -#endif diff --git a/csrc/quantization/gptq/matrix_view.cuh b/csrc/quantization/gptq/matrix_view.cuh deleted file mode 100644 index 1fdf019b29028ee406fd32c6e342b3598c49193c..0000000000000000000000000000000000000000 --- a/csrc/quantization/gptq/matrix_view.cuh +++ /dev/null @@ -1,151 +0,0 @@ -/* -Adapted from https://github.com/turboderp/exllamav2 and https://github.com/turboderp/exllama -*/ - -#ifndef _matrix_view_cuh -#define _matrix_view_cuh - -#include <cuda_runtime.h> -#include <cuda_fp16.h> - -#include "qdq_util.cuh" - -namespace vllm { -namespace gptq { - -class MatrixView_half -{ -public: - const half* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } - __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } - __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); } - __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; } - - __device__ __forceinline__ void item4(half (&items)[4], int row, int column) const - { - half2* ptr = (half2*) item_ptr(row, column); - half2 i01 = ptr[0]; - half2 i23 = ptr[1]; - items[0] = __low2half(i01); - items[1] = __high2half(i01); - items[2] = __low2half(i23); - items[3] = __high2half(i23); - } - __device__ __forceinline__ void item4_f(float (&items)[4], int row, int column) const - { - half2* ptr = (half2*)item_ptr(row, column); - half2 i01 = ptr[0]; - half2 i23 = ptr[1]; - items[0] = __half2float(__low2half(i01)); - items[1] = __half2float(__high2half(i01)); - items[2] = __half2float(__low2half(i23)); - items[3] = __half2float(__high2half(i23)); - } - - __device__ __forceinline__ void item4_h2(half2 (&items)[4], int row, int column) const - { - half2* ptr = (half2*)item_ptr(row, column); - half2 i01 = ptr[0]; - half2 i23 = ptr[1]; - items[0] = __half2half2(__low2half(i01)); - items[1] = __half2half2(__high2half(i01)); - items[2] = __half2half2(__low2half(i23)); - items[3] = __half2half2(__high2half(i23)); - } -}; - -class MatrixView_half_rw -{ -public: - half* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width) - : data(data), 
height(height), width(width) - { } - - __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } - __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } - __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; } - __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; } - __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; } - - __device__ __forceinline__ void set4(int row, int column, half v0, half v1, half v2, half v3) - { - half2 v01 = __halves2half2(v0, v1); - half2 v23 = __halves2half2(v2, v3); - half2* ptr = (half2*) item_ptr(row, column); - ptr[0] = v01; - ptr[1] = v23; - } -}; - -class MatrixView_q4_row -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (column & 0x07) * 4; - return (data[row * width / 8 + column / 8] >> shift) & 0x0f; - } - - __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const - { - int shift = (column & 0x07) * 4; - uint32_t d = data[row * width / 8 + column / 8] >> shift; - items[0] = d & 0x0f; - items[1] = (d >> 4) & 0x0f; - } - - __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const - { - int shift = (column & 0x07) * 4; - uint32_t d = data[row * width / 8 + column / 8] >> shift; - items[0] = d & 0x0f; - items[1] = (d >> 4) & 0x0f; - items[2] = (d >> 8) & 0x0f; - items[3] = (d >> 12) & 0x0f; - } -}; - -class MatrixView_q4_column -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (row & 0x07) * 4; - return (data[row / 8 * width + column] >> shift) & 0x0f; - } - - __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; } - __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; } -}; - -} // namespace gptq -} // namespace vllm -#endif diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu deleted file mode 100644 index a5d2345f1e7fd286cc045dbb6d2f404fdb9cdd04..0000000000000000000000000000000000000000 --- a/csrc/quantization/gptq/q_gemm.cu +++ /dev/null @@ -1,875 +0,0 @@ -/* -Adapted from https://github.com/turboderp/exllamav2 and https://github.com/qwopqwop200/GPTQ-for-LLaMa -*/ - -#include <cstdint> -#include <cstdio> - -#include <torch/extension.h> -#include <c10/cuda/CUDAGuard.h> -#include <ATen/cuda/CUDAContext.h> -#include <cuda_runtime.h> -#include <cuda_fp16.h> - -#include "compat.cuh" -#include "matrix_view.cuh" -#include "qdq_4.cuh" - -namespace vllm { -namespace gptq { - -#define BLOCK_KN_SIZE 128 -#define BLOCK_M_SIZE_MAX 8 -#define MAX_GROUPS_IN_BLOCK (BLOCK_KN_SIZE / 32) -#define MAX_Q_GEMM_ROWS 50 -#define MAX_ALT_GEMM_ROWS 8 -#define THREADS_X 32 -#define THREADS_Y 32 -#define DIVIDE(x, size) (((x) + (size) - 1) / (size)) - -#if 
defined(USE_ROCM) -#include <hipblas/hipblas.h> -__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle, - hipblasOperation_t transA, - hipblasOperation_t transB, - int m, - int n, - int k, - const half* alpha, - const half* AP, - int lda, - const half* BP, - int ldb, - const half* beta, - half* CP, - int ldc) { - return hipblasHgemm(handle, transA, transB, m, n, k, - reinterpret_cast<const hipblasHalf *>(alpha), - reinterpret_cast<const hipblasHalf *>(AP), lda, - reinterpret_cast<const hipblasHalf *>(BP), ldb, - reinterpret_cast<const hipblasHalf *>(beta), - reinterpret_cast<hipblasHalf *>(CP), ldc); -} -#define hipblasHgemm __compat_hipblasHgemm - -// Previous version of PyTorch were converting to rocBLAS instead of hipBLAS. -#define rocblas_operation_none HIPBLAS_OP_N -#define rocblas_hgemm __compat_hipblasHgemm -#endif - -__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __hadd2(result, g_result); -} - -__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __half2float(__low2half(result)) + __half2float(__high2half(result)); -} - -typedef void (*fp_gemm_half_q_half_gptq_kernel) -( - const half*, - const uint32_t*, - const uint32_t*, - const half*, - half*, - const int, - const int, - const int, - const int, - const int* -); - -template <bool first_block, int m_count> -__global__ void gemm_half_q_half_gptq_kernel -( - const half* __restrict__ a, - const uint32_t* __restrict__ b_q_weight, - const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - half* __restrict__ c, - const int size_m, - const int size_n, - const int size_k, - const int groups, - const int* __restrict__ b_q_perm -) -{ - MatrixView_half a_(a, size_m, size_k); - MatrixView_half_rw c_(c, size_m, size_n); - MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int t = threadIdx.x; - - // Block - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; - - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - int n = offset_n + t * 4; - - // Preload block_a - __shared__ half block_a[m_count][BLOCK_KN_SIZE]; - - if (offset_k + t < end_k) - { - for (int m = 0; m < m_count; ++m) - { - const half* a_ptr = a_.item_ptr(offset_m + m, 0); - half* block_a_ptr = block_a[m]; - - half a0; - if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]]; - else a0 = a_ptr[offset_k + t]; - block_a_ptr[t] = a0; - } - } - - // Zero output - if (n >= size_n) return; - - if (blockIdx.z == 0) - { - for (int m = 0; m < m_count; m++) - *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; - } - - __syncthreads(); - - // Find initial group - int groupsize = size_k / groups; - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; - - // a, b offset - int qk = offset_k / (32 / 4); - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - const half* a_ptr = &block_a[0][0]; - int a_stride = BLOCK_KN_SIZE; - - // Initial group - int 
zeros[4]; - float scales[4]; - half2 z1z16[4][2]; - half2 y1y16[4][2]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_f(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - - // Column result - float block_c[m_count][4] = {}; - - // Dequantize and multiply - int k = offset_k; - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_f(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - } - - #pragma unroll - for (int j = 0; j < 4; j++) - { - const int4* b_ptr4 = (int4*) b_ptr; - int4 load_int4 = *b_ptr4; - - half2 dq[4][4]; - dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false); - dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false); - dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false); - dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false); - - #pragma unroll - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = fma(dot22_8_f(dq[0], a_ptr + m * a_stride), scales[0], block_c[m][0]); - block_c[m][1] = fma(dot22_8_f(dq[1], a_ptr + m * a_stride), scales[1], block_c[m][1]); - block_c[m][2] = fma(dot22_8_f(dq[2], a_ptr + m * a_stride), scales[2], block_c[m][2]); - block_c[m][3] = fma(dot22_8_f(dq[3], a_ptr + m * a_stride), scales[3], block_c[m][3]); - } - - b_ptr += size_n; - a_ptr += 8; - } - - k += 32; - } - - for (int m = 0; m < m_count; m++) - { - half2 *out = (half2*) c_.item_ptr(offset_m + m, n); - half2 result01 = __halves2half2(__float2half_rn(block_c[m][0]), __float2half_rn(block_c[m][1])); - half2 result23 = __halves2half2(__float2half_rn(block_c[m][2]), __float2half_rn(block_c[m][3])); - atomicAdd(out , result01); - atomicAdd(out + 1, result23); - } -} - - -fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel(bool first_block, const int m_count) -{ - #if BLOCK_M_SIZE_MAX >= 1 - if (m_count == 1) return gemm_half_q_half_gptq_kernel<true, 1>; - #endif - #if BLOCK_M_SIZE_MAX >= 2 - if (m_count == 2) return gemm_half_q_half_gptq_kernel<true, 2>; - #endif - #if BLOCK_M_SIZE_MAX >= 3 - if (m_count == 3) return gemm_half_q_half_gptq_kernel<true, 3>; - #endif - #if BLOCK_M_SIZE_MAX >= 4 - if (m_count == 4) return gemm_half_q_half_gptq_kernel<true, 4>; - #endif - #if BLOCK_M_SIZE_MAX >= 5 - if (m_count == 5) return gemm_half_q_half_gptq_kernel<true, 5>; - #endif - #if BLOCK_M_SIZE_MAX >= 6 - if (m_count == 6) return gemm_half_q_half_gptq_kernel<true, 6>; - #endif - #if BLOCK_M_SIZE_MAX >= 7 - if (m_count == 7) return gemm_half_q_half_gptq_kernel<true, 7>; - #endif - #if BLOCK_M_SIZE_MAX >= 8 - if (m_count == 8) return gemm_half_q_half_gptq_kernel<true, 8>; - #endif - return NULL; -} - - -void gemm_half_q_half_cuda_part -( - const half* a, - const uint32_t* b_q_weight, - const uint32_t* b_gptq_qzeros, - const half* b_gptq_scales, - const int* b_q_perm, - half* c, - int size_m, - int size_n, - int size_k, - int m_count, - int groups -) -{ - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = DIVIDE(size_n, 
BLOCK_KN_SIZE * 4); - gridDim.y = DIVIDE(size_m, m_count); - gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); - - fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count); - - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - kernel<<<gridDim, blockDim, 0, stream>>> - ( - a, - b_q_weight, - b_gptq_qzeros, - b_gptq_scales, - c, - size_m, - size_n, - size_k, - groups, - b_q_perm - ); -} - - -__global__ void reconstruct_exllama_kernel -( - const uint32_t* __restrict__ b_q_weight, - const int* __restrict__ b_q_perm, - const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - const int size_k, - const int size_n, - const int groups, - half* __restrict__ b -) -{ - MatrixView_half_rw b_(b, size_k, size_n); - MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; - - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - // Preload remapping table - __shared__ int perm[BLOCK_KN_SIZE]; - int t = threadIdx.x; - - if (b_q_perm) - { - if (offset_k + t < size_k) - perm[t] = b_q_perm[offset_k + t]; - } - - // Column - int n = offset_n + t * 4; - if (n >= size_n) return; - - // Find initial group - int groupsize = size_k / groups; - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; - - // b offset - int qk = offset_k / (32 / 4); - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - - // Initial zeros/scale - int zeros[4]; - half2 scales[4]; - half2 z1z16[4][2]; - half2 y1y16[4][2]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - - __syncthreads(); - - int k = offset_k; - int lk = 0; - - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - } - - for (int p = 0; p < 4; p++) - { - half2 dq[4][4]; - const int4* b_ptr4 = (int4*) b_ptr; - int4 load_int4 = *b_ptr4; - - dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false); - dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false); - dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false); - dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false); - - b_ptr += size_n; - //half* dqh = (half*)dq; - if (b_q_perm) - { - for (int j = 0; j < 4; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } - else - { - for (int j = 0; j < 4; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), 
__low2half(dq[3][j])); - b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } - } - k += 32; - } -} - - -void reconstruct_exllama -( - const uint32_t* b_q_weight, - const uint32_t* b_gptq_qzeros, - const half* b_gptq_scales, - const int* b_q_perm, - half* out, - int height, - int width, - int groups -) -{ - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - gridDim.y = DIVIDE(height, BLOCK_KN_SIZE); - gridDim.x = DIVIDE(width, BLOCK_KN_SIZE); - - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - reconstruct_exllama_kernel<<<gridDim, blockDim, 0, stream>>> - ( - b_q_weight, - b_q_perm, - b_gptq_qzeros, - b_gptq_scales, - height, - width, - groups, - out - ); -} - - -__global__ void gemm_half_q_half_alt_kernel( - const half2* __restrict__ vec, - const uint32_t* __restrict__ mat, - half* __restrict__ mul, - const half* __restrict__ scales, - const uint32_t* __restrict__ zeros, - const int* __restrict__ g_idx, - int batch, - int height, - int width -) -{ - int zero_width = width / 8; - int vec_height = height * 4; - const int blockwidth2 = BLOCK_KN_SIZE / 2; - int b = blockIdx.y * BLOCK_M_SIZE_MAX; - int b_end = min(BLOCK_M_SIZE_MAX, batch - b); - int h = BLOCK_KN_SIZE * blockIdx.z / 8; - int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4; - int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; - - __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2]; - if (threadIdx.x < h_end) { - for (int m = 0; m < b_end; ++m) { - blockvec[m][threadIdx.x] = - vec[(m + b) * vec_height + blockIdx.z * BLOCK_KN_SIZE / 2 + - threadIdx.x]; - } - } - - __shared__ half2 deq2[256][8]; - int val = threadIdx.x / 8; - int off = threadIdx.x % 8; - for (; val < 256; val += BLOCK_KN_SIZE / 8) { - deq2[val][off] = __halves2half2( - __int2half_rn(val & 0xF), __int2half_rn(val >> 4) - ); - } - - if (blockIdx.z == 0) - { - for (int m = 0; m < b_end; m++) - mul[(b + m) * width + w] = __int2half_rn(0); - } - __syncthreads(); - - int i = width * h + w; - int g_h = h * 8; - int k = 0; - int z_w = w / 8; - int z_mod = (w % 8) * 4; - half2 res2; - half res[BLOCK_M_SIZE_MAX] = {}; - - unsigned int tmp; - while (k < h_end) { - tmp = mat[i]; - half2 scales_tmp[4]; - half2 zeros_tmp[4]; - for (int tmp_k = 0; tmp_k < 4; tmp_k++) { - int g = g_idx[g_h + (k + tmp_k) * 2]; - int g2 = g_idx[g_h + (k + tmp_k) * 2 + 1]; - half scale_f = scales[g * width + w]; - half scale_f2 = scales[g2 * width + w]; - half2 scale = __halves2half2(scale_f, scale_f2); - half2 zero = __halves2half2( - __hmul(scale_f, __int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xF) - 1)), - __hmul(scale_f2, __int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) - 1)) - ); - scales_tmp[tmp_k] = scale; - zeros_tmp[tmp_k] = zero; - } - for (int m = 0; m < b_end; m++) { -#ifndef USE_ROCM - res2 = {}; -#else - res2.x = __half_as_ushort(__float2half(0)); - res2.y = __half_as_ushort(__float2half(0)); -#endif - res2 = __hfma2(__hfma2(deq2[(tmp >> 0) & 0xff][off], scales_tmp[0], zeros_tmp[0]), blockvec[m][k + 0], res2); - res2 = __hfma2(__hfma2(deq2[(tmp >> 8) & 0xff][off], scales_tmp[1], zeros_tmp[1]), blockvec[m][k + 1], res2); - res2 = __hfma2(__hfma2(deq2[(tmp >> 16) & 0xff][off], scales_tmp[2], zeros_tmp[2]), blockvec[m][k + 2], res2); - res2 = __hfma2(__hfma2(deq2[(tmp >> 24) & 0xff][off], scales_tmp[3], zeros_tmp[3]), blockvec[m][k + 3], res2); -#ifndef USE_ROCM - res[m] = __hadd(res[m], __hadd(res2.x, res2.y)); -#else - res[m] = __hadd(res[m], 
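            // Fold the two half lanes of the half2 accumulator into the scalar
            // per-row sum; on ROCm the lanes are stored as ushorts and are
            // reinterpreted as half before the adds.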
__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y))); -#endif - } - i += width; - k += 4; - } - for (int m = 0; m < b_end; m++) { - atomicAdd(&mul[(b + m) * width + w], res[m]); - } -} - - -void gemm_half_q_half_alt -( - const half* a, - const uint32_t* b_q_weight, - const uint32_t* b_gptq_qzeros, - const half* b_gptq_scales, - const int* b_g_idx, - half* c, - int size_m, - int size_n, - int size_k -) -{ - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE); - gridDim.y = DIVIDE(size_m, BLOCK_M_SIZE_MAX); - gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); - - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - gemm_half_q_half_alt_kernel<<<gridDim, blockDim, 0, stream>>> - ( - (const half2*) a, - b_q_weight, - c, - b_gptq_scales, - b_gptq_qzeros, - b_g_idx, - size_m, - size_k / 8, - size_n - ); -} - - -__global__ void reconstruct_gptq_kernel -( - const uint32_t* __restrict__ w, - const half* __restrict__ w_scales, - const uint32_t* __restrict__ w_zeros, - const int* __restrict__ g_idx, - const int height, - const int width, - const int group, - half* __restrict__ out -) -{ - // Start of block - - int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; - int row = blockIdx.y * 8; - if (column >= width) return; - - // Views - - MatrixView_q4_column w_(w, height, width); - MatrixView_half_rw out_(out, height, width); - MatrixView_half w_scales_(w_scales, group, width); - MatrixView_q4_row w_zeros_(w_zeros, group, width); - - uint32_t w_read = w_.item_uint32_t(row, column); - half* out_ptr = out_.item_ptr(row, column); - - #pragma unroll - for (int s = 0; s < 32; s += 4) - { - int group = g_idx[row + s / 4]; - half w_scale = w_scales_.item(group, column); - uint32_t w_zero = w_zeros_.item(group, column) + 1; - half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale); - *out_ptr = w_item; out_ptr += out_.width; - } -} - - -void reconstruct_gptq -( - const uint32_t* b_q_weight, - const uint32_t* b_gptq_qzeros, - const half* b_gptq_scales, - const int* b_g_idx, - half* out, - int height, - int width, - int groups -) -{ - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - gridDim.y = DIVIDE(height, 8); - gridDim.x = DIVIDE(width, BLOCK_KN_SIZE); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - reconstruct_gptq_kernel<<<gridDim, blockDim, 0, stream>>> - ( - b_q_weight, - b_gptq_scales, - b_gptq_qzeros, - b_g_idx, - height, - width, - groups, - out - ); -} - - -void gemm_half_q_half_cuda -( - cublasHandle_t cublas_handle, - const half* a, - const uint32_t* b_q_weight, - const uint32_t* b_gptq_qzeros, - const half* b_gptq_scales, - const int* b_g_idx, - half* c, - half* temp_dq, - int size_m, - int size_n, - int size_k, - int groups, - bool use_exllama -) -{ - if ((use_exllama && size_m > MAX_Q_GEMM_ROWS) || (!use_exllama && size_m > MAX_ALT_GEMM_ROWS)) { - // Reconstruct FP16 matrix, then cuBLAS - if (use_exllama) { - reconstruct_exllama(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, temp_dq, - size_k, size_n, groups); - } - else - { - reconstruct_gptq(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, - temp_dq, size_k, size_n, groups); - } - - const half alpha = __float2half(1.0f); - const half beta = __float2half(0.0f); - cublasHgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - size_n, size_m, size_k, - &alpha, temp_dq, size_n, - a, size_k, - &beta, c, size_n); - } - else if (use_exllama) - { - // Quantized matmul - int max_chunks = 
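        // Process the m dimension in chunks of BLOCK_M_SIZE_MAX rows: full
        // chunks go through one launch, and any remainder is dispatched again
        // below with a kernel variant specialized for the smaller row count.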
size_m / BLOCK_M_SIZE_MAX; - int last_chunk = max_chunks * BLOCK_M_SIZE_MAX; - int last_chunk_size = size_m - last_chunk; - - if (max_chunks) - { - gemm_half_q_half_cuda_part(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, - c, last_chunk, size_n, size_k, BLOCK_M_SIZE_MAX, - groups); - } - - if (last_chunk_size) - { - gemm_half_q_half_cuda_part(a + last_chunk * size_k, b_q_weight, b_gptq_qzeros, - b_gptq_scales, b_g_idx, c + last_chunk * size_n, - last_chunk_size, size_n, size_k, last_chunk_size, - groups); - } - } - else - { - gemm_half_q_half_alt(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, - c, size_m, size_n, size_k); - } -} - - -__global__ void shuffle_kernel -( - uint32_t* __restrict__ b_q_weight, - const int size_k, - const int size_n -) -{ - int n = blockIdx.x * THREADS_X + threadIdx.x; - if (n >= size_n) return; - int k = 0; - uint32_t* b_ptr = b_q_weight + n; - while (k < size_k) { shuffle_4bit_8 (b_ptr, size_n); b_ptr += 1 * size_n; k += 8; } -} - - -__global__ void make_sequential_kernel -( - const uint32_t* __restrict__ w, - uint32_t* __restrict__ w_new, - const int* __restrict__ q_perm, - const int w_height, - const int w_width -) -{ - const uint64_t* w2 = (uint64_t*) w; - uint64_t* w_new2 = (uint64_t*) w_new; - int w2_stride = w_width >> 1; - int w2_column = THREADS_X * blockIdx.x + threadIdx.x; - if (w2_column >= w2_stride) return; - int w_new2_row = blockIdx.y; - int q_perm_idx = w_new2_row << 3; - uint64_t dst = 0; - - #pragma unroll - for (int i = 0; i < 8; i++) - { - int source_row = q_perm[q_perm_idx++]; - - int w2_row = source_row >> 3; - int w2_subrow = source_row & 0x07; - int w2_row_shift = w2_subrow << 2; - int wnew2_row_shift = i << 2; - - uint64_t src = w2[w2_row * w2_stride + w2_column]; - src >>= w2_row_shift; - src &= 0x0000000f0000000f; - src <<= wnew2_row_shift; - dst |= src; - } - w_new2[w_new2_row * w2_stride + w2_column] = dst; -} - - -void shuffle_exllama_weight -( - uint32_t* q_weight, - int* q_perm, - int height, - int width -) -{ - if (q_perm) - { - uint32_t* new_qweight = NULL; - cudaMalloc(&new_qweight, height / 8 * width * sizeof(uint32_t)); - - dim3 blockDim, gridDim; - blockDim.x = THREADS_X; - blockDim.y = 1; - gridDim.x = DIVIDE(width, THREADS_X); - gridDim.y = height / 8; - - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - make_sequential_kernel<<<gridDim, blockDim, 0, stream>>> - ( - q_weight, - new_qweight, - q_perm, - height / 8, - width - ); - // Replace qweights - cudaMemcpyAsync(q_weight, new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice); - // Cleanup - cudaDeviceSynchronize(); - cudaFree(new_qweight); - } - dim3 blockDim, gridDim; - blockDim.x = THREADS_X; - blockDim.y = 1; - gridDim.x = DIVIDE(width, THREADS_X); - gridDim.y = 1; - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - shuffle_kernel<<<gridDim, blockDim, 0, stream>>>(q_weight, height, width); -} - -} // namespace gptq -} // namespace vllm - -torch::Tensor gptq_gemm -( - torch::Tensor a, - torch::Tensor b_q_weight, - torch::Tensor b_gptq_qzeros, - torch::Tensor b_gptq_scales, - torch::Tensor b_g_idx, - bool use_exllama -) -{ - const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); - auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); - at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options); - at::Tensor temp_dq = torch::empty({b_q_weight.size(0) * 8, b_q_weight.size(1)}, options); - - vllm::gptq::gemm_half_q_half_cuda - ( - 
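        // temp_dq is a (size_k, size_n) FP16 scratch buffer; it is only filled
        // when size_m is large enough that the call reconstructs the full
        // dequantized weight and falls back to cuBLAS.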
at::cuda::getCurrentCUDABlasHandle(), - (const half*) a.data_ptr(), - (const uint32_t*) b_q_weight.data_ptr(), - (const uint32_t*)b_gptq_qzeros.data_ptr(), - (const half*) b_gptq_scales.data_ptr(), - b_g_idx.device().is_meta() ? NULL : (const int*) b_g_idx.data_ptr(), - (half*) c.data_ptr(), - (half*) temp_dq.data_ptr(), - c.size(0), // m - c.size(1), // n - a.size(1), // k - b_gptq_qzeros.size(0), // group number - use_exllama - ); - return c; -} - -void gptq_shuffle -( - torch::Tensor q_weight, - torch::Tensor q_perm -) -{ - const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight)); - vllm::gptq::shuffle_exllama_weight( - (uint32_t*) q_weight.data_ptr(), - q_perm.device().is_meta() ? NULL : (int*) q_perm.data_ptr(), - q_weight.size(0) * 8, - q_weight.size(1) - ); -} diff --git a/csrc/quantization/gptq/qdq_4.cuh b/csrc/quantization/gptq/qdq_4.cuh deleted file mode 100644 index cfc4635a22c1d16c99a7cb867dcf1004f3b0fe4c..0000000000000000000000000000000000000000 --- a/csrc/quantization/gptq/qdq_4.cuh +++ /dev/null @@ -1,235 +0,0 @@ -/* -Copied from https://github.com/turboderp/exllamav2 -*/ - -#ifndef _qdq_4_cuh -#define _qdq_4_cuh - -#include "qdq_util.cuh" - -namespace vllm { -namespace gptq { -// Permutation: -// -// 77775555 33331111 66664444 22220000 - -__forceinline__ __device__ void shuffle_4bit_8 -( - uint32_t* q, - int stride -) -{ - uint32_t qa = q[0]; - uint32_t qb = 0; - - #pragma unroll - for (int i = 0; i < 4; i++) - { - uint32_t qa0 = qa & 0x0f; - uint32_t qa1 = (qa & 0xf0) >> 4; - qa >>= 8; - qb |= (qa1 << (i * 4 + 16)); - qb |= (qa0 << (i * 4)); - } - q[0] = qb; -} - -__forceinline__ __device__ void dequant_4bit_8 -( - const uint32_t q_0, - half2 (&dq)[4], - int stride -) -{ - const uint32_t c0 = 0x64006400; - const half y16_ = __float2half_rn(1.0f / 16.0f); - const half2 y16 = __halves2half2(y16_, y16_); - const half z1_ = __float2half_rn(-1024.0f - 8.0f); - const half z16_ = __float2half_rn(-1024.0f / 16.0f - 8.0f); - const half2 z1 = __halves2half2(z1_, z1_); - const half2 z16 = __halves2half2(z16_, z16_); - - uint32_t qa = q_0; - half2_uint32 q0((qa & 0x000f000f) | c0); // half2(q[ 0], q[ 1]) + 1024 - half2_uint32 q1((qa & 0x00f000f0) | c0); // half2(q[ 2], q[ 3]) * 16 + 1024 - qa >>= 8; - half2_uint32 q2((qa & 0x000f000f) | c0); // half2(q[ 4], q[ 5]) + 1024 - half2_uint32 q3((qa & 0x00f000f0) | c0); // half2(q[ 6], q[ 7]) * 16 + 1024 - - dq[0] = __hadd2(q0.as_half2, z1); - dq[1] = __hfma2(q1.as_half2, y16, z16); - dq[2] = __hadd2(q2.as_half2, z1); - dq[3] = __hfma2(q3.as_half2, y16, z16); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale -( - const uint32_t zero, - const half scale, - half2 (&z1z16)[2], - half2 (&y1y16)[2] -) -{ - half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero); - half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero)); - - half2 scale2 = __half2half2(scale); - - z1z16[0] = __hmul2(scale2, __half2half2(z1.as_half)); - z1z16[1] = __hmul2(scale2, __half2half2(z16)); - - const half y1 = __float2half_rn(1.0f); - const half y16 = __float2half_rn(1.0f / 16.0f); - - y1y16[0] = __hmul2(scale2, __half2half2(y1)); - y1y16[1] = __hmul2(scale2, __half2half2(y16)); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero -( - const uint32_t zero, - half2(&z1z16)[2], - half2(&y1y16)[2] -) -{ - half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero); - half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero)); - - z1z16[0] = __half2half2(z1.as_half); - z1z16[1] = __half2half2(z16); - - const half y1 = 
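    // The packed nibbles come out as q and q * 16 for alternating half2 pairs;
    // y16 = 1/16 rescales the pre-multiplied pairs so both share the same
    // zero-point offsets computed above.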
__float2half_rn(1.0f); - const half y16 = __float2half_rn(1.0f / 16.0f); - - y1y16[0] = __half2half2(y1); - y1y16[1] = __half2half2(y16); -} - - -__forceinline__ __device__ void dequant_4bit_8_gptq -( - const uint32_t q_0, - half2 (&dq)[4], - half2 (&z1z16)[2], - half2 (&y1y16)[2], - int stride, - bool scaled -) -{ - const uint32_t c0 = 0x64006400; - - uint32_t qa = q_0; - half2_uint32 q0((qa & 0x000f000f) | c0); // half2( q[0] + 1024, q[1] + 1024 ) - half2_uint32 q1((qa & 0x00f000f0) | c0); // half2( q[2] * 16 + 1024, q[3] * 16 + 1024 ) - qa >>= 8; - half2_uint32 q2((qa & 0x000f000f) | c0); // half2( q[4] + 1024, q[5] + 1024 ) - half2_uint32 q3((qa & 0x00f000f0) | c0); // half2( q[6] * 16 + 1024, q[7] * 16 + 1024 ) - - if (scaled) - { - dq[0] = __hfma2(q0.as_half2, y1y16[0], z1z16[0]); // half2( q[0] * s - z * s, q[1] * s - z * s) - dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]); // half2( q[2] * s - z * s, q[3] * s - z * s) - dq[2] = __hfma2(q2.as_half2, y1y16[0], z1z16[0]); - dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]); - } - else - { - dq[0] = __hadd2(q0.as_half2, z1z16[0]); // half2( q[0] - z, q[1] - z ) - dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]); // half2( q[2] - z, q[3] - z ) - dq[2] = __hadd2(q2.as_half2, z1z16[0]); // half2( q[4] - z, q[5] - z ) - dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]); // half2( q[6] - z, q[7] - z ) - } -} -} // namespace gptq -} // namespace vllm - -#else - -namespace vllm { -namespace gptq { -__forceinline__ __device__ void shuffle_4bit_8 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_4bit_8 -( - const uint32_t q_0, - half2 (&dq)[4], - int stride -) -{ - half dqh[8]; - for (int i = 0; i < 8; i++) dqh[i] = dq_ns(exb(q_0, i * 4, 0x0f), 8); - - for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale -( - const uint32_t zero, - const half scale, - half2 (&z1)[2], - half2 (&y1)[2] -) -{ - half z = __int2half_rn(-((int)zero)); - z = __hmul(z, scale); - z1[0] = __half2half2(z); - y1[0] = __half2half2(scale); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero -( - const uint32_t zero, - half2(&z1)[2], - half2(&y1)[2] -) -{ - half z = __int2half_rn(-((int)zero)); - z1[0] = __half2half2(z); -} - -__forceinline__ __device__ void dequant_4bit_8_gptq -( - const uint32_t q_0, - half2 (&dq)[4], - half2 (&z1)[2], - half2 (&y1)[2], - int stride, - bool scaled -) -{ - half2 dqh2[8]; - - uint32_t qa = q_0; - for (int i = 0; i < 4; i++) - { - half d0 = __int2half_rn(qa & 0x0f); qa >>= 4; - half d1 = __int2half_rn(qa & 0x0f); qa >>= 4; - dqh2[i] = __halves2half2(d0, d1); - } - - if (scaled) - { - dq[0] = __hfma2(dqh2[0], y1[0], z1[0]); - dq[1] = __hfma2(dqh2[1], y1[0], z1[0]); - dq[2] = __hfma2(dqh2[2], y1[0], z1[0]); - dq[3] = __hfma2(dqh2[3], y1[0], z1[0]); - } - else - { - dq[0] = __hadd2(dqh2[0], z1[0]); - dq[1] = __hadd2(dqh2[1], z1[0]); - dq[2] = __hadd2(dqh2[2], z1[0]); - dq[3] = __hadd2(dqh2[3], z1[0]); - } -} - -} // namespace gptq -} // namespace vllm - -#endif diff --git a/csrc/quantization/gptq/qdq_util.cuh b/csrc/quantization/gptq/qdq_util.cuh deleted file mode 100644 index 1722a9aa6cb34935ab86bd755300032f4962057a..0000000000000000000000000000000000000000 --- a/csrc/quantization/gptq/qdq_util.cuh +++ /dev/null @@ -1,60 +0,0 @@ -/* -Copied from https://github.com/turboderp/exllamav2 -*/ - -#ifndef _qdq_util_cuh -#define _qdq_util_cuh - -namespace vllm { -namespace gptq { - -union half2_uint32 -{ - uint32_t 
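    // Type-punning helper: the dequant routines use this to view the same
    // 32 bits either as a packed pair of halves or as a raw uint32_t
    // bit pattern when assembling half2 constants.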
as_uint32; - half2 as_half2; - __device__ half2_uint32(uint32_t val) : as_uint32(val) {} - __device__ half2_uint32(half2 val) : as_half2(val) {} -}; - -union half_uint16 -{ - uint16_t as_uint16; - half as_half; - __device__ half_uint16(uint16_t val) : as_uint16(val) {} - __device__ half_uint16(half val) : as_half(val) {} -}; - -// Max_scale premultiplied by 1/256 - -__forceinline__ __device__ half dq_scale(const int qs, const half max_scale) -{ - int qs_i = qs + 1; - half qs_h = __int2half_rn(qs_i * qs_i); - qs_h = __hmul(qs_h, max_scale); - return qs_h; -} - -__forceinline__ __device__ half dq(const int q, const int qzero, const half scale) -{ - return __hmul(__int2half_rn(q - qzero), scale); -} - -__forceinline__ __device__ half dq_ns(const int q, const int qzero) -{ - //return __hsub(__int2half_rn(q), __int2half_rn(qzero)); - return __int2half_rn(q - qzero); -} - -__forceinline__ __device__ int exb(const uint32_t q, const int shift, const int mask) -{ - return (int)((q >> shift) & mask); -} - -__forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask) -{ - return (int)(__funnelshift_rc(q0, q1, shift) & mask); -} - -} // namespace gptq -} // namespace vllm -#endif diff --git a/csrc/quantization/squeezellm/quant_cuda_kernel.cu b/csrc/quantization/squeezellm/quant_cuda_kernel.cu deleted file mode 100644 index 09964903622b49dd0837f6de8b939b9884480223..0000000000000000000000000000000000000000 --- a/csrc/quantization/squeezellm/quant_cuda_kernel.cu +++ /dev/null @@ -1,225 +0,0 @@ -#include <torch/all.h> -#include <torch/python.h> -#include <cuda.h> -#include <cuda_runtime.h> -#include <cuda_fp16.h> - -// half-tensor -#include <c10/cuda/CUDAStream.h> -#include <ATen/cuda/CUDATensorMethods.cuh> -#include <c10/cuda/CUDAGuard.h> - -#define BLOCKWIDTH 128 -#define BLOCKHEIGHT4 16 - -namespace vllm { -namespace squeezellm { - -__device__ inline unsigned int as_unsigned(int i) { - return *reinterpret_cast<unsigned int*>(&i); -} - -// 4-bit matvec kernel (LUT-based) -__global__ void NUQ4MatMulKernel( -#ifndef USE_ROCM - const half2* __restrict__ vec, -#else - const __half2* __restrict__ vec, -#endif - const int* __restrict__ mat, -#ifndef USE_ROCM - half2* __restrict__ mul, -#else - float2* __restrict__ mul, -#endif - const __half* __restrict__ lookup_table, - int height, - int width, - int batch, - int vec_height -) { - - const int blockwidth2 = BLOCKWIDTH / 2; - - int row = BLOCKHEIGHT4 * blockIdx.x; - int col = BLOCKWIDTH * blockIdx.y + threadIdx.x; - -#ifndef USE_ROCM - __shared__ half2 blockvec[blockwidth2]; -#else - __shared__ __half2 blockvec[blockwidth2]; -#endif - - __shared__ __half deq2[16][BLOCKWIDTH]; - int off = threadIdx.x; - int column_offset = col * 16; - for (int val = 0; val < 16; val += 1) { - int lut_index = column_offset + val; - deq2[val][off] = lookup_table[lut_index]; - } - - __half res; -#ifndef USE_ROCM - half2 res2; - half2 tmp2; -#else - __half2 res2; - __half2 tmp2; -#endif - - int i; - int k; - - unsigned int tmp1; - unsigned int lut_index1, lut_index2; - - for (int b = 0; b < batch; ++b){ - i = width * row + col; - res = __int2half_rd(0); - k = 0; - - __syncthreads(); - if (threadIdx.x < blockwidth2) - blockvec[threadIdx.x] = vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 + threadIdx.x]; - __syncthreads(); - - while (k < blockwidth2) { - tmp1 = as_unsigned(mat[i]); - -#ifndef USE_ROCM - res2 = {}; - tmp2 = {}; -#else - res2.x = __half_as_ushort(__float2half(0)); - res2.y = __half_as_ushort(__float2half(0)); - 
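            // tmp1 packs eight 4-bit codes; below, each code indexes this
            // thread's column of the shared deq2 lookup table, two codes at a
            // time are combined into a half2 and fused-multiply-added against
            // the cached activation vector.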
tmp2.x = __half_as_ushort(__float2half(0)); - tmp2.y = __half_as_ushort(__float2half(0)); -#endif - - lut_index1 = tmp1 & 0xF; - lut_index2 = (tmp1 >> 4) & 0xF; -#ifndef USE_ROCM - tmp2.x = deq2[lut_index1][off]; - tmp2.y = deq2[lut_index2][off]; -#else - tmp2.x = __half_as_ushort(deq2[lut_index1][off]); - tmp2.y = __half_as_ushort(deq2[lut_index2][off]); -#endif - res2 = __hfma2(tmp2, blockvec[k + 0], res2); - - lut_index1 = (tmp1 >> 8) & 0xF; - lut_index2 = (tmp1 >> 12) & 0xF; -#ifndef USE_ROCM - tmp2.x = deq2[lut_index1][off]; - tmp2.y = deq2[lut_index2][off]; -#else - tmp2.x = __half_as_ushort(deq2[lut_index1][off]); - tmp2.y = __half_as_ushort(deq2[lut_index2][off]); -#endif - res2 = __hfma2(tmp2, blockvec[k + 1], res2); - - lut_index1 = (tmp1 >> 16) & 0xF; - lut_index2 = (tmp1 >> 20) & 0xF; -#ifndef USE_ROCM - tmp2.x = deq2[lut_index1][off]; - tmp2.y = deq2[lut_index2][off]; -#else - tmp2.x = __half_as_ushort(deq2[lut_index1][off]); - tmp2.y = __half_as_ushort(deq2[lut_index2][off]); -#endif - res2 = __hfma2(tmp2, blockvec[k + 2], res2); - - lut_index1 = (tmp1 >> 24) & 0xF; - lut_index2 = (tmp1 >> 28) & 0xF; -#ifndef USE_ROCM - tmp2.x = deq2[lut_index1][off]; - tmp2.y = deq2[lut_index2][off]; -#else - tmp2.x = __half_as_ushort(deq2[lut_index1][off]); - tmp2.y = __half_as_ushort(deq2[lut_index2][off]); -#endif - res2 = __hfma2(tmp2, blockvec[k + 3], res2); - -#ifndef USE_ROCM - res = __hadd(__hadd(res2.x, res2.y), res); -#else - res = __hadd(__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)), res); -#endif - - i += width; - k += 4; - } - - // col%2 -> only set one of the two values -#ifndef USE_ROCM - half2 res3 = {}; - if (col % 2 == 0) { - res3.x = res; - } else { - res3.y = res; - } -#else - __half2 res3; - res3.x = __half_as_ushort(__float2half(0)); - res3.y = __half_as_ushort(__float2half(0)); - if (col % 2 == 0) { - res3.x = __half_as_ushort(res); - } else { - res3.y = __half_as_ushort(res); - } -#endif - -#ifndef USE_ROCM - atomicAdd(&mul[b * width / 2 + col / 2], res3); -#else - int tmp_addr = b * width / 2 + col / 2; - atomicAdd(&(mul[tmp_addr].x), __half2float(__ushort_as_half(res3.x))); - atomicAdd(&(mul[tmp_addr].y), __half2float(__ushort_as_half(res3.y))); -#endif - } -} - -} // namespace squeezellm -} // namespace vllm - -// 4-bit matvec kernel (LUT-based) -void squeezellm_gemm( - torch::Tensor vec, - torch::Tensor mat, - torch::Tensor mul, - torch::Tensor lookup_table -) { - int height = mat.size(0); - int width = mat.size(1); - - int batch = vec.size(0); - int vec_height = vec.size(1); - - dim3 blocks( - (height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4, - (width + BLOCKWIDTH - 1) / BLOCKWIDTH - ); - dim3 threads(BLOCKWIDTH); - - const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads, 0, stream>>>( -#ifndef USE_ROCM - (half2*) vec.data<at::Half>(), -#else - (__half2*) vec.data_ptr<at::Half>(), -#endif - mat.data_ptr<int>(), -#ifndef USE_ROCM - (half2*) mul.data<at::Half>(), - (__half*) lookup_table.data<at::Half>(), -#else - (float2*) mul.data_ptr<float>(), - (__half*) lookup_table.data_ptr<at::Half>(), -#endif - height, width, batch, vec_height - ); -} - -#undef BLOCKWIDTH -#undef BLOCKHEIGHT4 diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh deleted file mode 100644 index b95ccef1620728b89d5b8d9c9a87fcbcd6626eb8..0000000000000000000000000000000000000000 --- a/csrc/reduction_utils.cuh +++ /dev/null @@ -1,53 +0,0 @@ -/* - * 
Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh - * Copyright (c) 2023, The vLLM team. - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "cuda_compat.h" - -namespace vllm { - -template<typename T> -__inline__ __device__ T warpReduceSum(T val) { -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) - val += VLLM_SHFL_XOR_SYNC(val, mask); - return val; -} - -/* Calculate the sum of all elements in a block */ -template<typename T> -__inline__ __device__ T blockReduceSum(T val) { - static __shared__ T shared[32]; - int lane = threadIdx.x & 0x1f; - int wid = threadIdx.x >> 5; - - val = warpReduceSum<T>(val); - - if (lane == 0) - shared[wid] = val; - - __syncthreads(); - - // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent - // blockDim.x is not divided by 32 - val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); - val = warpReduceSum<T>(val); - return val; -} - -} // namespace vllm diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index d0c3cbf1020d5c292abdedf27627c6abe25e2293..0000000000000000000000000000000000000000 --- a/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = source -BUILDDIR = build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 46488c9bb0b928cc9707ebfa56538de6b7768f83..0000000000000000000000000000000000000000 --- a/docs/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# vLLM documents - -## Build the docs - -```bash -# Install dependencies. -pip install -r requirements-docs.txt - -# Build the docs. -make clean -make html -``` - -## Open the docs with your browser - -```bash -python -m http.server -d build/html/ -``` -Launch your browser and open localhost:8000. diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 747ffb7b3033659bdd2d1e6eae41ecb00358a45e..0000000000000000000000000000000000000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "" goto help - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt deleted file mode 100644 index 95e54bd151850f9f3a9832eddfca6fd52d36645a..0000000000000000000000000000000000000000 --- a/docs/requirements-docs.txt +++ /dev/null @@ -1,3 +0,0 @@ -sphinx == 6.2.1 -sphinx-book-theme == 1.0.1 -sphinx-copybutton == 0.5.2 diff --git a/docs/source/assets/logos/vllm-logo-only-light.png b/docs/source/assets/logos/vllm-logo-only-light.png deleted file mode 100644 index 7aaf1748725945c4616e838484fa46d4bafba46a..0000000000000000000000000000000000000000 Binary files a/docs/source/assets/logos/vllm-logo-only-light.png and /dev/null differ diff --git a/docs/source/assets/logos/vllm-logo-text-dark.png b/docs/source/assets/logos/vllm-logo-text-dark.png deleted file mode 100644 index 959a42fd36c72152254000630dddeef6a84c62bf..0000000000000000000000000000000000000000 Binary files a/docs/source/assets/logos/vllm-logo-text-dark.png and /dev/null differ diff --git a/docs/source/assets/logos/vllm-logo-text-light.png b/docs/source/assets/logos/vllm-logo-text-light.png deleted file mode 100644 index 1ead9972879c29e17160f1d3ead150482467b1e8..0000000000000000000000000000000000000000 Binary files a/docs/source/assets/logos/vllm-logo-text-light.png and /dev/null differ diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index 44c976468ab06ad1ab34b26ba7a9e349ab9d4dcd..0000000000000000000000000000000000000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,96 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. - -import os -import sys -from sphinx.ext import autodoc -import logging - -sys.path.insert(0, os.path.abspath(os.path.join('..', '..'))) - -logger = logging.getLogger(__name__) - -# -- Project information ----------------------------------------------------- - -project = 'vLLM' -copyright = '2023, vLLM Team' -author = 'the vLLM Team' - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.napoleon", - "sphinx.ext.viewcode", - "sphinx.ext.intersphinx", - "sphinx_copybutton", - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. 
-# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [] - -# Exclude the prompt "$" when copying code -copybutton_prompt_text = r"\$ " -copybutton_prompt_is_regexp = True - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_title = project -html_theme = 'sphinx_book_theme' -html_logo = 'assets/logos/vllm-logo-text-light.png' -html_theme_options = { - 'path_to_docs': 'docs/source', - 'repository_url': 'https://github.com/vllm-project/vllm', - 'use_repository_button': True, -} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ['_static'] - -# Mock out external dependencies here. -autodoc_mock_imports = [ - "torch", "transformers", "psutil", "aioprometheus", "sentencepiece", - "vllm.cuda_utils", "vllm._C" -] - -for mock_target in autodoc_mock_imports: - if mock_target in sys.modules: - logger.info( - f"Potentially problematic mock target ({mock_target}) found; " - "autodoc_mock_imports cannot mock modules that have already " - "been loaded into sys.modules when the sphinx build starts.") - - -class MockedClassDocumenter(autodoc.ClassDocumenter): - """Remove note about base class when a class is derived from object.""" - - def add_line(self, line: str, source: str, *lineno: int) -> None: - if line == " Bases: :py:class:`object`": - return - super().add_line(line, source, *lineno) - - -autodoc.ClassDocumenter = MockedClassDocumenter diff --git a/docs/source/dev/engine/async_llm_engine.rst b/docs/source/dev/engine/async_llm_engine.rst deleted file mode 100644 index 47db1e0a401b1ccc4a2fc40441326141eb779aec..0000000000000000000000000000000000000000 --- a/docs/source/dev/engine/async_llm_engine.rst +++ /dev/null @@ -1,7 +0,0 @@ - -AsyncLLMEngine -================================= - -.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine - :members: generate, abort - :show-inheritance: diff --git a/docs/source/dev/engine/engine_index.rst b/docs/source/dev/engine/engine_index.rst deleted file mode 100644 index ba9ae55ddea4671f708d9956b41d18cb4e54d13f..0000000000000000000000000000000000000000 --- a/docs/source/dev/engine/engine_index.rst +++ /dev/null @@ -1,13 +0,0 @@ -vLLM Engine -================================= - -.. automodule:: vllm.engine -.. currentmodule:: vllm.engine - -.. toctree:: - :maxdepth: 2 - :caption: Engines - - llm_engine - async_llm_engine - diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.rst deleted file mode 100644 index b550a9b5faa62b54218a8f6979300dc2f908e921..0000000000000000000000000000000000000000 --- a/docs/source/dev/engine/llm_engine.rst +++ /dev/null @@ -1,6 +0,0 @@ -LLMEngine -================================= - -.. autoclass:: vllm.engine.llm_engine.LLMEngine - :members: add_request, abort_request, step, _init_cache - :show-inheritance: \ No newline at end of file diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst deleted file mode 100644 index 6851ba136351c8295549c04809a26d400c9d482c..0000000000000000000000000000000000000000 --- a/docs/source/getting_started/amd-installation.rst +++ /dev/null @@ -1,171 +0,0 @@ -.. 
_installation_rocm: - -Installation with ROCm -====================== - -vLLM 0.2.4 onwards supports model inferencing and serving on AMD GPUs with ROCm. -At the moment AWQ quantization is not supported in ROCm, but SqueezeLLM quantization has been ported. -Data types currently supported in ROCm are FP16 and BF16. - -Requirements ------------- - -* OS: Linux -* Python: 3.8 -- 3.11 -* GPU: MI200s (gfx90a), MI300 (gfx942) -* Pytorch 2.0.1/2.1.1/2.2 -* ROCm 5.7 (Verified on python 3.10) or ROCm 6.0 (Verified on python 3.9) - -Installation options: - -#. :ref:`(Recommended) Quick start with vLLM pre-installed in Docker Image <quick_start_docker_rocm>` -#. :ref:`Build from source <build_from_source_rocm>` -#. :ref:`Build from source with docker <build_from_source_docker_rocm>` - -.. _quick_start_docker_rocm: - -(Recommended) Option 1: Quick start with vLLM pre-installed in Docker Image ---------------------------------------------------------------------------- - -This option is for ROCm 5.7 only: - -.. code-block:: console - - $ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.4 - $ docker run -it \ - --network=host \ - --group-add=video \ - --ipc=host \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device /dev/kfd \ - --device /dev/dri \ - -v <path/to/model>:/app/model \ - embeddedllminfo/vllm-rocm \ - bash - - -.. _build_from_source_rocm: - -Option 2: Build from source ---------------------------- - -You can build and install vLLM from source: - -Below instruction is for ROCm 5.7 only. -At the time of this documentation update, PyTorch on ROCm 6.0 wheel is not yet available on the PyTorch website. - -0. Install prerequisites (skip if you are already in an environment/docker with the following installed): - -- `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_ -- `Pytorch <https://pytorch.org/>`_ - - .. code-block:: console - - $ pip install torch==2.2.0.dev20231206+rocm5.7 --index-url https://download.pytorch.org/whl/nightly/rocm5.7 # tested version - - -1. Install `flash attention for ROCm <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm>`_ - - Install ROCm's flash attention (v2.0.4) following the instructions from `ROCmSoftwarePlatform/flash-attention <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_ - -.. note:: - - If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly. - - If you fail to install `ROCmSoftwarePlatform/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`. - - ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention. - - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) - -2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention - - .. code-block:: console - - $ pip install xformers==0.0.23 --no-deps - $ bash patch_xformers.rocm.sh - -3. Build vLLM. - - .. code-block:: console - - $ cd vllm - $ pip install -U -r requirements-rocm.txt - $ python setup.py install # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation - - -.. 
_build_from_source_docker_rocm: - -Option 3: Build from source with docker ------------------------------------------------------ - -You can build and install vLLM from source: - -Build a docker image from `Dockerfile.rocm`, and launch a docker container. - -The `Dokerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later versions. It provides flexibility to customize the build of docker image using the following arguments: - -* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1` -* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` -* `FA_BRANCH`: specifies the branch used to build the flash-attention in `ROCmSoftwarePlatform's flash-attention repo <https://github.com/ROCmSoftwarePlatform/flash-attention>`_. The default is `3d2b6f5` - -Their values can be passed in when running ``docker build`` with ``--build-arg`` options. - -For example, to build docker image for vllm on ROCm 5.7, you can run: - -.. code-block:: console - - $ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \ - -f Dockerfile.rocm -t vllm-rocm . - -To build vllm on ROCm 6.0, you can use the default: - -.. code-block:: console - - $ docker build -f Dockerfile.rocm -t vllm-rocm . - $ docker run -it \ - --network=host \ - --group-add=video \ - --ipc=host \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device /dev/kfd \ - --device /dev/dri \ - -v <path/to/model>:/app/model \ - vllm-rocm \ - bash - -Alternatively, if you plan to install vLLM-ROCm on a local machine or start from a fresh docker image (e.g. rocm/pytorch), you can follow the steps below: - -0. Install prerequisites (skip if you are already in an environment/docker with the following installed): - -- `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_ -- `Pytorch <https://pytorch.org/>`_ -- `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_ - -1. Install `flash attention for ROCm <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm>`_ - - Install ROCm's flash attention (v2.0.4) following the instructions from `ROCmSoftwarePlatform/flash-attention <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_ - -.. note:: - - If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly. - - If you fail to install `ROCmSoftwarePlatform/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`. - - ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention. - - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) - -2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention - - .. code-block:: console - - $ pip install xformers==0.0.23 --no-deps - $ bash patch_xformers.rocm.sh - -3. Build vLLM. - - .. code-block:: console - - $ cd vllm - $ pip install -U -r requirements-rocm.txt - $ python setup.py install # This may take 5-10 minutes. - -.. 
note:: - - - You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation. - diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst deleted file mode 100644 index 911c3d8f9a4ab0ab3d4116fcf34c24030008bc3a..0000000000000000000000000000000000000000 --- a/docs/source/getting_started/installation.rst +++ /dev/null @@ -1,69 +0,0 @@ -.. _installation: - -Installation -============ - -vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. - -Requirements ------------- - -* OS: Linux -* Python: 3.8 -- 3.11 -* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) - -Install with pip ----------------- - -You can install vLLM using pip: - -.. code-block:: console - - $ # (Optional) Create a new conda environment. - $ conda create -n myenv python=3.9 -y - $ conda activate myenv - - $ # Install vLLM with CUDA 12.1. - $ pip install vllm - -.. note:: - - As of now, vLLM's binaries are compiled on CUDA 12.1 by default. - However, you can install vLLM with CUDA 11.8 by running: - - .. code-block:: console - - $ # Install vLLM with CUDA 11.8. - $ export VLLM_VERSION=0.2.4 - $ export PYTHON_VERSION=39 - $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl - - $ # Re-install PyTorch with CUDA 11.8. - $ pip uninstall torch -y - $ pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118 - - $ # Re-install xFormers with CUDA 11.8. - $ pip uninstall xformers -y - $ pip install --upgrade xformers --index-url https://download.pytorch.org/whl/cu118 - - -.. _build_from_source: - -Build from source ------------------ - -You can also build and install vLLM from source: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ pip install -e . # This may take 5-10 minutes. - -.. tip:: - If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. - - .. code-block:: console - - $ # Use `--ipc=host` to make sure the shared memory is large enough. - $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst deleted file mode 100644 index 7c44a96865a50be7649aa188e9d8d053d1ff48ae..0000000000000000000000000000000000000000 --- a/docs/source/getting_started/quickstart.rst +++ /dev/null @@ -1,176 +0,0 @@ -.. _quickstart: - -Quickstart -========== - -This guide shows how to use vLLM to: - -* run offline batched inference on a dataset; -* build an API server for a large language model; -* start an OpenAI-compatible API server. - -Be sure to complete the :ref:`installation instructions <installation>` before continuing with this guide. - -.. note:: - - By default, vLLM downloads model from `HuggingFace <https://huggingface.co/>`_. If you would like to use models from `ModelScope <https://www.modelscope.cn>`_ in the following examples, please set the environment variable: - - .. code-block:: shell - - export VLLM_USE_MODELSCOPE=True - -Offline Batched Inference -------------------------- - -We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts. - -Import ``LLM`` and ``SamplingParams`` from vLLM. 
The ``LLM`` class is the main class for running offline inference with vLLM engine. The ``SamplingParams`` class specifies the parameters for the sampling process. - -.. code-block:: python - - from vllm import LLM, SamplingParams - -Define the list of input prompts and the sampling parameters for generation. The sampling temperature is set to 0.8 and the nucleus sampling probability is set to 0.95. For more information about the sampling parameters, refer to the `class definition <https://github.com/vllm-project/vllm/blob/main/vllm/sampling_params.py>`_. - -.. code-block:: python - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -Initialize vLLM's engine for offline inference with the ``LLM`` class and the `OPT-125M model <https://arxiv.org/abs/2205.01068>`_. The list of supported models can be found at :ref:`supported models <supported_models>`. - -.. code-block:: python - - llm = LLM(model="facebook/opt-125m") - -Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens. - -.. code-block:: python - - outputs = llm.generate(prompts, sampling_params) - - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -The code example can also be found in `examples/offline_inference.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py>`_. - -OpenAI-Compatible Server ------------------------- - -vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. -By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the command below) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_, `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_, and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints. - -Start the server: - -.. code-block:: console - - $ python -m vllm.entrypoints.openai.api_server \ - $ --model facebook/opt-125m - -By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument: - -.. code-block:: console - - $ python -m vllm.entrypoints.openai.api_server \ - $ --model facebook/opt-125m \ - $ --chat-template ./examples/template_chatml.jinja - -This server can be queried in the same format as OpenAI API. For example, list the models: - -.. code-block:: console - - $ curl http://localhost:8000/v1/models - -You can pass in the argument ``--api-key`` or environment variable ``VLLM_API_KEY`` to enable the server to check for API key in the header. - -Using OpenAI Completions API with vLLM -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Query the model with input prompts: - -.. 
code-block:: console - - $ curl http://localhost:8000/v1/completions \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "model": "facebook/opt-125m", - $ "prompt": "San Francisco is a", - $ "max_tokens": 7, - $ "temperature": 0 - $ }' - -Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the ``openai`` python package: - -.. code-block:: python - - from openai import OpenAI - - # Modify OpenAI's API key and API base to use vLLM's API server. - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - completion = client.completions.create(model="facebook/opt-125m", - prompt="San Francisco is a") - print("Completion result:", completion) - -For a more detailed client example, refer to `examples/openai_completion_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py>`_. - -Using OpenAI Chat API with vLLM -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The vLLM server is designed to support the OpenAI Chat API, allowing you to engage in dynamic conversations with the model. The chat interface is a more interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. - -Querying the model using OpenAI Chat API: - -You can use the `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_ endpoint to communicate with the model in a chat-like interface: - -.. code-block:: console - - $ curl http://localhost:8000/v1/chat/completions \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "model": "facebook/opt-125m", - $ "messages": [ - $ {"role": "system", "content": "You are a helpful assistant."}, - $ {"role": "user", "content": "Who won the world series in 2020?"} - $ ] - $ }' - -Python Client Example: - -Using the `openai` python package, you can also communicate with the model in a chat-like manner: - -.. code-block:: python - - from openai import OpenAI - # Set OpenAI's API key and API base to use vLLM's API server. - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - - chat_response = client.chat.completions.create( - model="facebook/opt-125m", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Tell me a joke."}, - ] - ) - print("Chat response:", chat_response) - -For more in-depth examples and advanced features of the chat API, you can refer to the official OpenAI documentation. diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index 321f855645bb8c37cdef2400b66545e0fbd89abb..0000000000000000000000000000000000000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,100 +0,0 @@ -Welcome to vLLM! -================ - -.. figure:: ./assets/logos/vllm-logo-text-light.png - :width: 60% - :align: center - :alt: vLLM - :class: no-scaled-link - -.. 
raw:: html - - <p style="text-align:center"> - <strong>Easy, fast, and cheap LLM serving for everyone - </strong> - </p> - - <p style="text-align:center"> - <script async defer src="https://buttons.github.io/buttons.js"></script> - <a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a> - <a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a> - <a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a> - </p> - - - -vLLM is a fast and easy-to-use library for LLM inference and serving. - -vLLM is fast with: - -* State-of-the-art serving throughput -* Efficient management of attention key and value memory with **PagedAttention** -* Continuous batching of incoming requests -* Fast model execution with CUDA/HIP graph -* Quantization: `GPTQ <https://arxiv.org/abs/2210.17323>`_, `AWQ <https://arxiv.org/abs/2306.00978>`_, `SqueezeLLM <https://arxiv.org/abs/2306.07629>`_ -* Optimized CUDA kernels - -vLLM is flexible and easy to use with: - -* Seamless integration with popular HuggingFace models -* High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more -* Tensor parallelism support for distributed inference -* Streaming outputs -* OpenAI-compatible API server -* Support NVIDIA GPUs and AMD GPUs - -For more information, check out the following: - -* `vLLM announcing blog post <https://vllm.ai>`_ (intro to PagedAttention) -* `vLLM paper <https://arxiv.org/abs/2309.06180>`_ (SOSP 2023) -* `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency <https://www.anyscale.com/blog/continuous-batching-llm-inference>`_ by Cade Daniel et al. - - - -Documentation -------------- - -.. toctree:: - :maxdepth: 1 - :caption: Getting Started - - getting_started/installation - getting_started/amd-installation - getting_started/quickstart - -.. toctree:: - :maxdepth: 1 - :caption: Serving - - serving/distributed_serving - serving/run_on_sky - serving/deploying_with_triton - serving/deploying_with_docker - serving/serving_with_langchain - serving/metrics - -.. toctree:: - :maxdepth: 1 - :caption: Models - - models/supported_models - models/adding_model - models/engine_args - -.. toctree:: - :maxdepth: 1 - :caption: Quantization - - quantization/auto_awq - -.. toctree:: - :maxdepth: 2 - :caption: Developer Documentation - - dev/engine/engine_index - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst deleted file mode 100644 index bf243a044769fde370f3856d296837492d1ca51f..0000000000000000000000000000000000000000 --- a/docs/source/models/adding_model.rst +++ /dev/null @@ -1,96 +0,0 @@ -.. _adding_a_new_model: - -Adding a New Model -================== - -This document provides a high-level guide on integrating a `HuggingFace Transformers <https://github.com/huggingface/transformers>`_ model into vLLM. - -.. note:: - The complexity of adding a new model depends heavily on the model's architecture. - The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. - However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. - -.. 
tip:: - If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository. - We will be happy to help you out! - - -0. Fork the vLLM repository --------------------------------- - -Start by forking our `GitHub`_ repository and then :ref:`build it from source <build_from_source>`. -This gives you the ability to modify the codebase and test your model. - - -1. Bring your model code ------------------------- - -Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `vllm/model_executor/models <https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models>`_ directory. -For instance, vLLM's `OPT model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/opt.py>`_ was adapted from the HuggingFace's `modeling_opt.py <https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py>`_ file. - -.. warning:: - When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. - - -2. Rewrite the :code:`forward` methods --------------------------------------- - -Next, you need to rewrite the :code:`forward` methods of your model by following these steps: - -1. Remove any unnecessary code, such as the code only used for training. -2. Change the input parameters: - -.. code-block:: diff - - def forward( - self, - input_ids: torch.Tensor, - - attention_mask: Optional[torch.Tensor] = None, - - position_ids: Optional[torch.LongTensor] = None, - - past_key_values: Optional[List[torch.FloatTensor]] = None, - - inputs_embeds: Optional[torch.FloatTensor] = None, - - labels: Optional[torch.LongTensor] = None, - - use_cache: Optional[bool] = None, - - output_attentions: Optional[bool] = None, - - output_hidden_states: Optional[bool] = None, - - return_dict: Optional[bool] = None, - -) -> Union[Tuple, CausalLMOutputWithPast]: - + positions: torch.Tensor, - + kv_caches: List[KVCache], - + input_metadata: InputMetadata, - +) -> Optional[SamplerOutput]: - -1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors. -2. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture. - -.. note:: - Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. - If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. - - -3. (Optional) Implement tensor parallelism and quantization support -------------------------------------------------------------------- - -If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. -To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. -For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`. -When it comes to the linear layers, we provide the following options to parallelize them: - -* :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. -* :code:`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). 
An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. -* :code:`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. -* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices. -* :code:`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices. - -Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. - -4. Implement the weight loading logic -------------------------------------- - -You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class. -This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. - -5. Register your model ----------------------- - -Finally, include your :code:`*ForCausalLM` class in `vllm/model_executor/models/__init__.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/__init__.py>`_ and register it to the :code:`_MODEL_REGISTRY` in `vllm/model_executor/model_loader.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/model_loader.py>`_. diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst deleted file mode 100644 index d89b79514950141c10565df9beb73be2ba1212aa..0000000000000000000000000000000000000000 --- a/docs/source/models/engine_args.rst +++ /dev/null @@ -1,116 +0,0 @@ -.. _engine_args: - -Engine Arguments -================ - -Below, you can find an explanation of every engine argument for vLLM: - -.. option:: --model <model_name_or_path> - - Name or path of the huggingface model to use. - -.. option:: --tokenizer <tokenizer_name_or_path> - - Name or path of the huggingface tokenizer to use. - -.. option:: --revision <revision> - - The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. - -.. option:: --tokenizer-revision <revision> - - The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. - -.. option:: --tokenizer-mode {auto,slow} - - The tokenizer mode. - - * "auto" will use the fast tokenizer if available. - * "slow" will always use the slow tokenizer. - -.. option:: --trust-remote-code - - Trust remote code from huggingface. - -.. 
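Most of these flags have keyword-argument equivalents on the Python :code:`LLM` entry point. A minimal sketch of that mapping (using :code:`facebook/opt-125m` purely as an illustrative model; the exact set of forwarded keyword arguments is an assumption here):

.. code-block:: python

    from vllm import LLM

    # The CLI flags --model, --tokenizer-mode, --trust-remote-code and
    # --revision correspond to the keyword arguments below.
    llm = LLM(
        model="facebook/opt-125m",
        tokenizer_mode="auto",
        trust_remote_code=False,
        revision=None,
    )
    print(llm.generate("Hello, my name is")[0].outputs[0].text)
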
option:: --download-dir <directory> - - Directory to download and load the weights, default to the default cache dir of huggingface. - -.. option:: --load-format {auto,pt,safetensors,npcache,dummy} - - The format of the model weights to load. - - * "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available. - * "pt" will load the weights in the pytorch bin format. - * "safetensors" will load the weights in the safetensors format. - * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading. - * "dummy" will initialize the weights with random values, mainly for profiling. - -.. option:: --dtype {auto,half,float16,bfloat16,float,float32} - - Data type for model weights and activations. - - * "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models. - * "half" for FP16. Recommended for AWQ quantization. - * "float16" is the same as "half". - * "bfloat16" for a balance between precision and range. - * "float" is shorthand for FP32 precision. - * "float32" for FP32 precision. - -.. option:: --max-model-len <length> - - Model context length. If unspecified, will be automatically derived from the model config. - -.. option:: --worker-use-ray - - Use Ray for distributed serving, will be automatically set when using more than 1 GPU. - -.. option:: --pipeline-parallel-size (-pp) <size> - - Number of pipeline stages. - -.. option:: --tensor-parallel-size (-tp) <size> - - Number of tensor parallel replicas. - -.. option:: --max-parallel-loading-workers <workers> - - Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models. - -.. option:: --block-size {8,16,32} - - Token block size for contiguous chunks of tokens. - -.. option:: --seed <seed> - - Random seed for operations. - -.. option:: --swap-space <size> - - CPU swap space size (GiB) per GPU. - -.. option:: --gpu-memory-utilization <fraction> - - The fraction of GPU memory to be used for the model executor, which can range from 0 to 1. - For example, a value of 0.5 would imply 50% GPU memory utilization. - If unspecified, will use the default value of 0.9. - -.. option:: --max-num-batched-tokens <tokens> - - Maximum number of batched tokens per iteration. - -.. option:: --max-num-seqs <sequences> - - Maximum number of sequences per iteration. - -.. option:: --max-paddings <paddings> - - Maximum number of paddings in a batch. - -.. option:: --disable-log-stats - - Disable logging statistics. - -.. option:: --quantization (-q) {awq,squeezellm,None} - - Method used to quantize the weights. diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst deleted file mode 100644 index 2663cf2366e64e52b70a19f1a220c92aaa9ad334..0000000000000000000000000000000000000000 --- a/docs/source/models/supported_models.rst +++ /dev/null @@ -1,116 +0,0 @@ -.. _supported_models: - -Supported Models -================ - -vLLM supports a variety of generative Transformer models in `HuggingFace Transformers <https://huggingface.co/models>`_. -The following is the list of model architectures that are currently supported by vLLM. -Alongside each architecture, we include some popular models that use it. - -.. list-table:: - :widths: 25 25 50 - :header-rows: 1 - - * - Architecture - - Models - - Example HuggingFace Models - * - :code:`AquilaForCausalLM` - - Aquila - - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. 
- * - :code:`BaiChuanForCausalLM` - - Baichuan - - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. - * - :code:`ChatGLMModel` - - ChatGLM - - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. - * - :code:`DeciLMForCausalLM` - - DeciLM - - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. - * - :code:`BloomForCausalLM` - - BLOOM, BLOOMZ, BLOOMChat - - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. - * - :code:`FalconForCausalLM` - - Falcon - - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. - * - :code:`GPT2LMHeadModel` - - GPT-2 - - :code:`gpt2`, :code:`gpt2-xl`, etc. - * - :code:`GPTBigCodeForCausalLM` - - StarCoder, SantaCoder, WizardCoder - - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. - * - :code:`GPTJForCausalLM` - - GPT-J - - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. - * - :code:`GPTNeoXForCausalLM` - - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. - * - :code:`InternLMForCausalLM` - - InternLM - - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. - * - :code:`LlamaForCausalLM` - - LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco - - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, etc. - * - :code:`MistralForCausalLM` - - Mistral, Mistral-Instruct - - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. - * - :code:`MixtralForCausalLM` - - Mixtral-8x7B, Mixtral-8x7B-Instruct - - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, etc. - * - :code:`MPTForCausalLM` - - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. - * - :code:`OPTForCausalLM` - - OPT, OPT-IML - - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. - * - :code:`PhiForCausalLM` - - Phi - - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. - * - :code:`QWenLMHeadModel` - - Qwen - - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. - * - :code:`Qwen2ForCausalLM` - - Qwen2 - - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. - * - :code:`StableLMEpochForCausalLM` - - StableLM - - :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. - * - :code:`YiForCausalLM` - - Yi - - :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc. - -If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. -Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model. -Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ project. - -.. note:: - Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. - -.. tip:: - The easiest way to check if your model is supported is to run the program below: - - .. code-block:: python - - from vllm import LLM - - llm = LLM(model=...) 
# Name or path of your model - output = llm.generate("Hello, my name is") - print(output) - - If vLLM successfully generates text, it indicates that your model is supported. - -.. tip:: - To use models from `ModelScope <https://www.modelscope.cn>`_ instead of HuggingFace Hub, set an environment variable: - - .. code-block:: shell - - $ export VLLM_USE_MODELSCOPE=True - - And use with :code:`trust_remote_code=True`. - - .. code-block:: python - - from vllm import LLM - - llm = LLM(model=..., revision=..., trust_remote_code=True) # Name or path of your model - output = llm.generate("Hello, my name is") - print(output) diff --git a/docs/source/quantization/auto_awq.rst b/docs/source/quantization/auto_awq.rst deleted file mode 100644 index bbbb9aee78b3cdafa003a122ec80dae03b63ad76..0000000000000000000000000000000000000000 --- a/docs/source/quantization/auto_awq.rst +++ /dev/null @@ -1,75 +0,0 @@ -.. _auto_awq: - -AutoAWQ -================== - -.. warning:: - - Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better - accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency - inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version. - -To create a new 4-bit quantized model, you can leverage `AutoAWQ <https://github.com/casper-hansen/AutoAWQ>`_. -Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. -The main benefits are lower latency and memory usage. - -You can quantize your own models by installing AutoAWQ or picking one of the `400+ models on Huggingface <https://huggingface.co/models?sort=trending&search=awq>`_. - -.. code-block:: console - - $ pip install autoawq - -After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize Vicuna 7B v1.5: - -.. code-block:: python - - from awq import AutoAWQForCausalLM - from transformers import AutoTokenizer - - model_path = 'lmsys/vicuna-7b-v1.5' - quant_path = 'vicuna-7b-v1.5-awq' - quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } - - # Load model - model = AutoAWQForCausalLM.from_pretrained(model_path, **{"low_cpu_mem_usage": True}) - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - - # Quantize - model.quantize(tokenizer, quant_config=quant_config) - - # Save quantized model - model.save_quantized(quant_path) - tokenizer.save_pretrained(quant_path) - -To run an AWQ model with vLLM, you can use `TheBloke/Llama-2-7b-Chat-AWQ <https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ>`_ with the following command: - -.. code-block:: console - - $ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq - -AWQ models are also supported directly through the LLM entrypoint: - -.. code-block:: python - - from vllm import LLM, SamplingParams - - # Sample prompts. - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - # Create a sampling params object. - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - # Create an LLM. - llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") - # Generate texts from the prompts. 
The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/docs/source/quantization/fp8_e5m2_kv_cache.rst b/docs/source/quantization/fp8_e5m2_kv_cache.rst deleted file mode 100644 index 10437260ad964e9720ef7dfeebe6791637d77b00..0000000000000000000000000000000000000000 --- a/docs/source/quantization/fp8_e5m2_kv_cache.rst +++ /dev/null @@ -1,32 +0,0 @@ -.. _fp8_e5m2_kv_cache: - -FP8 E5M2 KV Cache -================== - -The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. -The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bflaot16 and fp8 to each other. - -Here is an example of how to enable this feature: - -.. code-block:: python - from vllm import LLM, SamplingParams - # Sample prompts. - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - # Create a sampling params object. - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - # Create an LLM. - llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8_e5m2") - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst deleted file mode 100644 index 7ec769630300dcca8ce584cffaddf3cba1ba98ec..0000000000000000000000000000000000000000 --- a/docs/source/serving/deploying_with_docker.rst +++ /dev/null @@ -1,51 +0,0 @@ -.. _deploying_with_docker: - -Deploying with Docker -============================ - -vLLM offers official docker image for deployment. -The image can be used to run OpenAI compatible server. -The image is available on Docker Hub as `vllm/vllm-openai <https://hub.docker.com/r/vllm/vllm-openai/tags>`_. - -.. code-block:: console - - $ docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=<secret>" \ - -p 8000:8000 \ - --ipc=host \ - vllm/vllm-openai:latest \ - --model mistralai/Mistral-7B-v0.1 - - -.. note:: - - You can either use the ``ipc=host`` flag or ``--shm-size`` flag to allow the - container to access the host's shared memory. vLLM uses PyTorch, which uses shared - memory to share data between processes under the hood, particularly for tensor parallel inference. - - -You can build and run vLLM from source via the provided dockerfile. To build vLLM: - -.. code-block:: console - - $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 - - -.. note:: - - By default vLLM will build for all GPU types for widest distribution. If you are just building for the - current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""`` - for vLLM to find the current GPU type and build for that. - - -To run vLLM: - -.. 
code-block:: console - - $ docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -p 8000:8000 \ - --env "HUGGING_FACE_HUB_TOKEN=<secret>" \ - vllm/vllm-openai <args...> - diff --git a/docs/source/serving/deploying_with_triton.rst b/docs/source/serving/deploying_with_triton.rst deleted file mode 100644 index 5ce7c3d03dd2d0a871ce022d7283bf4bf0497cf0..0000000000000000000000000000000000000000 --- a/docs/source/serving/deploying_with_triton.rst +++ /dev/null @@ -1,6 +0,0 @@ -.. _deploying_with_triton: - -Deploying with NVIDIA Triton -============================ - -The `Triton Inference Server <https://github.com/triton-inference-server>`_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m <https://huggingface.co/facebook/opt-125m>`_ model using vLLM. Please see `Deploying a vLLM model in Triton <https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton>`_ for more details. diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst deleted file mode 100644 index 4f36dca15d7d1d27581e5cb11a2e927993d98aab..0000000000000000000000000000000000000000 --- a/docs/source/serving/distributed_serving.rst +++ /dev/null @@ -1,38 +0,0 @@ -.. _distributed_serving: - -Distributed Inference and Serving -================================= - -vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We manage the distributed runtime with `Ray <https://github.com/ray-project/ray>`_. To run distributed inference, install Ray with: - -.. code-block:: console - - $ pip install ray - -To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: - -.. code-block:: python - - from vllm import LLM - llm = LLM("facebook/opt-13b", tensor_parallel_size=4) - output = llm.generate("San Franciso is a") - -To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: - -.. code-block:: console - - $ python -m vllm.entrypoints.api_server \ - $ --model facebook/opt-13b \ - $ --tensor-parallel-size 4 - -To scale vLLM beyond a single machine, start a `Ray runtime <https://docs.ray.io/en/latest/ray-core/starting-ray.html>`_ via CLI before running vLLM: - -.. code-block:: console - - $ # On head node - $ ray start --head - - $ # On worker nodes - $ ray start --address=<ray-head-address> - -After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. \ No newline at end of file diff --git a/docs/source/serving/metrics.rst b/docs/source/serving/metrics.rst deleted file mode 100644 index 15e57bd3fec6530fd4265f59f02132c6938f3087..0000000000000000000000000000000000000000 --- a/docs/source/serving/metrics.rst +++ /dev/null @@ -1,13 +0,0 @@ -Production Metrics -================== - -vLLM exposes a number of metrics that can be used to monitor the health of the -system. These metrics are exposed via the `/metrics` endpoint on the vLLM -OpenAI compatible API server. - -The following metrics are exposed: - -.. 
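As a rough illustration (assuming the OpenAI-compatible server is already running locally on the default port 8000), the endpoint can be polled with plain HTTP:

.. code-block:: python

    import requests

    # Assumes a server such as
    #   python -m vllm.entrypoints.openai.api_server --model facebook/opt-125m
    # is already listening on localhost:8000.
    resp = requests.get("http://localhost:8000/metrics")
    resp.raise_for_status()
    # Print the metric samples, skipping Prometheus comment/type lines.
    for line in resp.text.splitlines():
        if line and not line.startswith("#"):
            print(line)
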
literalinclude:: ../../../vllm/engine/metrics.py - :language: python - :start-after: begin-metrics-definitions - :end-before: end-metrics-definitions diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst deleted file mode 100644 index 2c88d24dc5d0b2e251d4d52871f08fabc553f1c0..0000000000000000000000000000000000000000 --- a/docs/source/serving/run_on_sky.rst +++ /dev/null @@ -1,69 +0,0 @@ -.. _on_cloud: - -Running on clouds with SkyPilot -=============================== - -.. raw:: html - - <p align="center"> - <img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/> - </p> - -vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud. - -To install SkyPilot and setup your cloud credentials, run: - -.. code-block:: console - - $ pip install skypilot - $ sky check - -See the vLLM SkyPilot YAML for serving, `serving.yaml <https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml>`__. - -.. code-block:: yaml - - resources: - accelerators: A100 - - envs: - MODEL_NAME: decapoda-research/llama-13b-hf - TOKENIZER: hf-internal-testing/llama-tokenizer - - setup: | - conda create -n vllm python=3.9 -y - conda activate vllm - git clone https://github.com/vllm-project/vllm.git - cd vllm - pip install . - pip install gradio - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.api_server \ - --model $MODEL_NAME \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - --tokenizer $TOKENIZER 2>&1 | tee api_server.log & - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - echo 'Starting gradio server...' - python vllm/examples/gradio_webserver.py - -Start the serving the LLaMA-13B model on an A100 GPU: - -.. code-block:: console - - $ sky launch serving.yaml - -Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. - -.. code-block:: console - - (task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live - -**Optional**: Serve the 65B model instead of the default 13B and use more GPU: - -.. code-block:: console - - sky launch -c vllm-serve-new -s serve.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf - diff --git a/docs/source/serving/serving_with_langchain.rst b/docs/source/serving/serving_with_langchain.rst deleted file mode 100644 index 2e1ce688290ad88b915ad389764b2d62e52cb058..0000000000000000000000000000000000000000 --- a/docs/source/serving/serving_with_langchain.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. _run_on_langchain: - -Serving with Langchain -============================ - -vLLM is also available via `Langchain <https://github.com/langchain-ai/langchain>`_ . - -To install langchain, run - -.. code-block:: console - - $ pip install langchain -q - -To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``. - -.. code-block:: python - - from langchain.llms import VLLM - - llm = VLLM(model="mosaicml/mpt-7b", - trust_remote_code=True, # mandatory for hf models - max_new_tokens=128, - top_k=10, - top_p=0.95, - temperature=0.8, - # tensor_parallel_size=... 
# for distributed inference - ) - - print(llm("What is the capital of France ?")) - -Please refer to this `Tutorial <https://github.com/langchain-ai/langchain/blob/master/docs/docs/integrations/llms/vllm.ipynb>`_ for more details. \ No newline at end of file diff --git a/examples/api_client.py b/examples/api_client.py deleted file mode 100644 index 70ec8c5492124ac0e4c7dc01857a6bde7c4c92da..0000000000000000000000000000000000000000 --- a/examples/api_client.py +++ /dev/null @@ -1,77 +0,0 @@ -"""Example Python client for vllm.entrypoints.api_server""" - -import argparse -import json -from typing import Iterable, List - -import requests - - -def clear_line(n: int = 1) -> None: - LINE_UP = '\033[1A' - LINE_CLEAR = '\x1b[2K' - for _ in range(n): - print(LINE_UP, end=LINE_CLEAR, flush=True) - - -def post_http_request(prompt: str, - api_url: str, - n: int = 1, - stream: bool = False) -> requests.Response: - headers = {"User-Agent": "Test Client"} - pload = { - "prompt": prompt, - "n": n, - "use_beam_search": True, - "temperature": 0.0, - "max_tokens": 16, - "stream": stream, - } - response = requests.post(api_url, headers=headers, json=pload, stream=True) - return response - - -def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: - for chunk in response.iter_lines(chunk_size=8192, - decode_unicode=False, - delimiter=b"\0"): - if chunk: - data = json.loads(chunk.decode("utf-8")) - output = data["text"] - yield output - - -def get_response(response: requests.Response) -> List[str]: - data = json.loads(response.content) - output = data["text"] - return output - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--n", type=int, default=4) - parser.add_argument("--prompt", type=str, default="San Francisco is a") - parser.add_argument("--stream", action="store_true") - args = parser.parse_args() - prompt = args.prompt - api_url = f"http://{args.host}:{args.port}/generate" - n = args.n - stream = args.stream - - print(f"Prompt: {prompt!r}\n", flush=True) - response = post_http_request(prompt, api_url, n, stream) - - if stream: - num_printed_lines = 0 - for h in get_streaming_response(response): - clear_line(num_printed_lines) - num_printed_lines = 0 - for i, line in enumerate(h): - num_printed_lines += 1 - print(f"Beam candidate {i}: {line!r}", flush=True) - else: - output = get_response(response) - for i, line in enumerate(output): - print(f"Beam candidate {i}: {line!r}", flush=True) diff --git a/examples/gradio_openai_chatbot_webserver.py b/examples/gradio_openai_chatbot_webserver.py deleted file mode 100644 index 61e91d6b0c8b647a68721844cb039dbc524a2966..0000000000000000000000000000000000000000 --- a/examples/gradio_openai_chatbot_webserver.py +++ /dev/null @@ -1,81 +0,0 @@ -import argparse -from openai import OpenAI -import gradio as gr - -# Argument parser setup -parser = argparse.ArgumentParser( - description='Chatbot Interface with Customizable Parameters') -parser.add_argument('--model-url', - type=str, - default='http://localhost:8000/v1', - help='Model URL') -parser.add_argument('-m', - '--model', - type=str, - required=True, - help='Model name for the chatbot') -parser.add_argument('--temp', - type=float, - default=0.8, - help='Temperature for text generation') -parser.add_argument('--stop-token-ids', - type=str, - default='', - help='Comma-separated stop token IDs') -parser.add_argument("--host", type=str, 
default=None) -parser.add_argument("--port", type=int, default=8001) - -# Parse the arguments -args = parser.parse_args() - -# Set OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = args.model_url - -# Create an OpenAI client to interact with the API server -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - - -def predict(message, history): - # Convert chat history to OpenAI format - history_openai_format = [{ - "role": "system", - "content": "You are a great ai assistant." - }] - for human, assistant in history: - history_openai_format.append({"role": "user", "content": human}) - history_openai_format.append({ - "role": "assistant", - "content": assistant - }) - history_openai_format.append({"role": "user", "content": message}) - - # Create a chat completion request and send it to the API server - stream = client.chat.completions.create( - model=args.model, # Model name to use - messages=history_openai_format, # Chat history - temperature=args.temp, # Temperature for text generation - stream=True, # Stream response - extra_body={ - 'repetition_penalty': - 1, - 'stop_token_ids': [ - int(id.strip()) for id in args.stop_token_ids.split(',') - if id.strip() - ] if args.stop_token_ids else [] - }) - - # Read and return generated text from response stream - partial_message = "" - for chunk in stream: - partial_message += (chunk.choices[0].delta.content or "") - yield partial_message - - -# Create and launch a chat interface with Gradio -gr.ChatInterface(predict).queue().launch(server_name=args.host, - server_port=args.port, - share=True) diff --git a/examples/gradio_webserver.py b/examples/gradio_webserver.py deleted file mode 100644 index 54e907582986f7b9c451646489d08ef30b72cd63..0000000000000000000000000000000000000000 --- a/examples/gradio_webserver.py +++ /dev/null @@ -1,52 +0,0 @@ -import argparse -import json - -import gradio as gr -import requests - - -def http_bot(prompt): - headers = {"User-Agent": "vLLM Client"} - pload = { - "prompt": prompt, - "stream": True, - "max_tokens": 128, - } - response = requests.post(args.model_url, - headers=headers, - json=pload, - stream=True) - - for chunk in response.iter_lines(chunk_size=8192, - decode_unicode=False, - delimiter=b"\0"): - if chunk: - data = json.loads(chunk.decode("utf-8")) - output = data["text"][0] - yield output - - -def build_demo(): - with gr.Blocks() as demo: - gr.Markdown("# vLLM text completion demo\n") - inputbox = gr.Textbox(label="Input", - placeholder="Enter text and press ENTER") - outputbox = gr.Textbox(label="Output", - placeholder="Generated result from the model") - inputbox.submit(http_bot, [inputbox], [outputbox]) - return demo - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default=None) - parser.add_argument("--port", type=int, default=8001) - parser.add_argument("--model-url", - type=str, - default="http://localhost:8000/generate") - args = parser.parse_args() - - demo = build_demo() - demo.queue().launch(server_name=args.host, - server_port=args.port, - share=True) diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py deleted file mode 100644 index cea15f8045d8a4953848aee29fbc47a4e4036339..0000000000000000000000000000000000000000 --- a/examples/llm_engine_example.py +++ /dev/null @@ -1,62 +0,0 @@ -import argparse -from typing import List, Tuple - -from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput - - -def create_test_prompts() -> 
List[Tuple[str, SamplingParams]]: - """Create a list of test prompts with their sampling parameters.""" - return [ - ("A robot may not injure a human being", - SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)), - ("To be or not to be,", - SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), - ("What is the meaning of life?", - SamplingParams(n=2, - best_of=5, - temperature=0.8, - top_p=0.95, - frequency_penalty=0.1)), - ("It is only with the heart that one can see rightly", - SamplingParams(n=3, best_of=3, use_beam_search=True, - temperature=0.0)), - ] - - -def process_requests(engine: LLMEngine, - test_prompts: List[Tuple[str, SamplingParams]]): - """Continuously process a list of prompts and handle the outputs.""" - request_id = 0 - - while test_prompts or engine.has_unfinished_requests(): - if test_prompts: - prompt, sampling_params = test_prompts.pop(0) - engine.add_request(str(request_id), prompt, sampling_params) - request_id += 1 - - request_outputs: List[RequestOutput] = engine.step() - - for request_output in request_outputs: - if request_output.finished: - print(request_output) - - -def initialize_engine(args: argparse.Namespace) -> LLMEngine: - """Initialize the LLMEngine from the command line arguments.""" - engine_args = EngineArgs.from_cli_args(args) - return LLMEngine.from_engine_args(engine_args) - - -def main(args: argparse.Namespace): - """Main function that sets up and runs the prompt processing.""" - engine = initialize_engine(args) - test_prompts = create_test_prompts() - process_requests(engine, test_prompts) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Demo on using the LLMEngine class directly') - parser = EngineArgs.add_cli_args(parser) - args = parser.parse_args() - main(args) diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py deleted file mode 100644 index 8fdd243af69ff0092d8ab582ab34150b8c87b18e..0000000000000000000000000000000000000000 --- a/examples/multilora_inference.py +++ /dev/null @@ -1,117 +0,0 @@ -""" -This example shows how to use the multi-LoRA functionality for offline inference. - -Requires HuggingFace credentials for access to Llama2. -""" - -from typing import Optional, List, Tuple - -from huggingface_hub import snapshot_download - -from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput -from vllm.lora.request import LoRARequest - - -def create_test_prompts(lora_path: str) -> List[Tuple[str, SamplingParams]]: - """Create a list of test prompts with their sampling parameters. - - 2 requests for base model, 4 requests for the LoRA. We define 2 - different LoRA adapters (using the same model for demo purposes). - Since we also set `max_loras=1`, the expectation is that the requests - with the second LoRA adapter will be ran after all requests with the - first adapter have finished. 
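    Each LoRARequest below is built from a human-readable adapter name, a
    globally unique integer id, and the local path of the downloaded adapter
    weights.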
- """ - return [ - ("A robot may not injure a human being", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128), None), - ("To be or not to be,", - SamplingParams(temperature=0.8, - top_k=5, - presence_penalty=0.2, - max_tokens=128), None), - ("[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), - ("[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", - SamplingParams(n=3, - best_of=3, - use_beam_search=True, - temperature=0, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), - ("[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", - SamplingParams(temperature=0.0, - logprobs=1, - prompt_logprobs=1, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora2", 2, lora_path)), - ("[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", - SamplingParams(n=3, - best_of=3, - use_beam_search=True, - temperature=0, - max_tokens=128, - stop_token_ids=[32003]), - LoRARequest("sql-lora", 1, lora_path)), - ] - - -def process_requests(engine: LLMEngine, - test_prompts: List[Tuple[str, SamplingParams, - Optional[LoRARequest]]]): - """Continuously process a list of prompts and handle the outputs.""" - request_id = 0 - - while test_prompts or engine.has_unfinished_requests(): - if test_prompts: - prompt, sampling_params, lora_request = test_prompts.pop(0) - engine.add_request(str(request_id), - prompt, - sampling_params, - lora_request=lora_request) - request_id += 1 - - request_outputs: List[RequestOutput] = engine.step() - - for request_output in request_outputs: - if request_output.finished: - print(request_output) - - -def initialize_engine() -> LLMEngine: - """Initialize the LLMEngine.""" - # max_loras: controls the number of LoRAs that can be used in the same - # batch. Larger numbers will cause higher memory usage, as each LoRA - # slot requires its own preallocated tensor. - # max_lora_rank: controls the maximum supported rank of all LoRAs. Larger - # numbers will cause higher memory usage. If you know that all LoRAs will - # use the same rank, it is recommended to set this as low as possible. - # max_cpu_loras: controls the size of the CPU LoRA cache. 
- engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf", - enable_lora=True, - max_loras=1, - max_lora_rank=8, - max_cpu_loras=2, - max_num_seqs=256) - return LLMEngine.from_engine_args(engine_args) - - -def main(): - """Main function that sets up and runs the prompt processing.""" - engine = initialize_engine() - lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - test_prompts = create_test_prompts(lora_path) - process_requests(engine, test_prompts) - - -if __name__ == '__main__': - main() diff --git a/examples/offline_inference.py b/examples/offline_inference.py deleted file mode 100644 index 9b758fa2479f66ff3a192ca0c2b8df60f604642c..0000000000000000000000000000000000000000 --- a/examples/offline_inference.py +++ /dev/null @@ -1,22 +0,0 @@ -from vllm import LLM, SamplingParams - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -# Create an LLM. -llm = LLM(model="facebook/opt-125m") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py deleted file mode 100644 index 8ccfb1ceea7313c2ab58630b3d016cd4369a05fa..0000000000000000000000000000000000000000 --- a/examples/offline_inference_with_prefix.py +++ /dev/null @@ -1,59 +0,0 @@ -from vllm import LLM, SamplingParams - -prefix = ( - "You are an expert school principal, skilled in effectively managing " - "faculty and staff. Draft 10-15 questions for a potential first grade " - "Head Teacher for my K-12, all-girls', independent school that emphasizes " - "community, joyful discovery, and life-long learning. The candidate is " - "coming in for a first-round panel interview for a 8th grade Math " - "teaching role. They have 5 years of previous teaching experience " - "as an assistant teacher at a co-ed, public school with experience " - "in middle school math teaching. Based on these information, fulfill " - "the following paragraph: ") - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.0) - -# Create an LLM. -llm = LLM(model="facebook/opt-125m") - -generating_prompts = [prefix + prompt for prompt in prompts] - -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(generating_prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -print("-" * 80) - -# -1 since the last token can change when concatenating prompts. -prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1 - -# The llm.generate call will batch all prompts and send the batch at once if resources allow. 
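# (prefix_pos marks how many leading tokens of each prompt are treated as a
# shared prefix whose KV cache the engine may reuse across requests.)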
-# The prefix will only be cached after the first batch is processed, so we need to call generate once -# to calculate the prefix and cache it. -outputs = llm.generate(generating_prompts[0], - sampling_params, - prefix_pos=[prefix_pos]) - -# Subsequent batches can leverage the cached prefix -outputs = llm.generate(generating_prompts, - sampling_params, - prefix_pos=[prefix_pos] * len(generating_prompts)) - -# Print the outputs. You should see the same outputs as before -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/openai_chatcompletion_client.py b/examples/openai_chatcompletion_client.py deleted file mode 100644 index bbada3891bd199d7b1ff847f9a346518a5a74dea..0000000000000000000000000000000000000000 --- a/examples/openai_chatcompletion_client.py +++ /dev/null @@ -1,36 +0,0 @@ -from openai import OpenAI - -# Modify OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - -chat_completion = client.chat.completions.create( - messages=[{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "Who won the world series in 2020?" - }, { - "role": - "assistant", - "content": - "The Los Angeles Dodgers won the World Series in 2020." - }, { - "role": "user", - "content": "Where was it played?" - }], - model=model, -) - -print("Chat completion results:") -print(chat_completion) diff --git a/examples/openai_completion_client.py b/examples/openai_completion_client.py deleted file mode 100644 index 58519f978d340a1af85581f478d2729999e5d26f..0000000000000000000000000000000000000000 --- a/examples/openai_completion_client.py +++ /dev/null @@ -1,31 +0,0 @@ -from openai import OpenAI - -# Modify OpenAI's API key and API base to use vLLM's API server. 
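# (The client library requires some api_key value, but a vLLM server started
# without authentication does not check it, so the placeholder below suffices.)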
-openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - -# Completion API -stream = False -completion = client.completions.create( - model=model, - prompt="A robot may not injure a human being", - echo=False, - n=2, - stream=stream, - logprobs=3) - -print("Completion results:") -if stream: - for c in completion: - print(c) -else: - print(completion) diff --git a/examples/template_alpaca.jinja b/examples/template_alpaca.jinja deleted file mode 100644 index 60667acc3ef962ced3fa0ee709a9c4af7979cbbd..0000000000000000000000000000000000000000 --- a/examples/template_alpaca.jinja +++ /dev/null @@ -1,29 +0,0 @@ -{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} - -{% for message in messages %} -{% if message['role'] == 'user' %} -### Instruction: -{{ message['content']|trim -}} -{% if not loop.last %} - - -{% endif %} -{% elif message['role'] == 'assistant' %} -### Response: -{{ message['content']|trim -}} -{% if not loop.last %} - - -{% endif %} -{% elif message['role'] == 'user_context' %} -### Input: -{{ message['content']|trim -}} -{% if not loop.last %} - - -{% endif %} -{% endif %} -{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} -### Response: -{% endif %} \ No newline at end of file diff --git a/examples/template_baichuan.jinja b/examples/template_baichuan.jinja deleted file mode 100644 index a1812a6c09ab127ffd7fbe60fb9617de90f292c7..0000000000000000000000000000000000000000 --- a/examples/template_baichuan.jinja +++ /dev/null @@ -1,22 +0,0 @@ -{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} - -{% for message in messages %} -{% if message['role'] == 'user' %} -<reserved_106> -{{ message['content']|trim -}} -{% if not loop.last %} - - -{% endif %} -{% elif message['role'] == 'assistant' %} -<reserved_107> -{{ message['content']|trim -}} -{% if not loop.last %} - - -{% endif %} -{% endif %} -{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} -<reserved_107> -{% endif %} \ No newline at end of file diff --git a/examples/template_chatml.jinja b/examples/template_chatml.jinja deleted file mode 100644 index 4844e681e1b6c8e4bda4cdd0f64122f96a45c195..0000000000000000000000000000000000000000 --- a/examples/template_chatml.jinja +++ /dev/null @@ -1,2 +0,0 @@ -{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} \ No newline at end of file diff --git a/examples/template_inkbot.jinja b/examples/template_inkbot.jinja deleted file mode 100644 index 33a817454df36f55fb34213d16d45536cb5dc476..0000000000000000000000000000000000000000 --- a/examples/template_inkbot.jinja +++ /dev/null @@ -1,30 +0,0 @@ -<#meta#> -- Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }} -- Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if 
(messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }} -<#system#> -{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} -<#chat#> -{% for message in messages %} -{% if message['role'] == 'user' %} -<#user#> -{{ message['content']|trim -}} -{% if not loop.last %} - -{% endif %} -{% elif message['role'] == 'assistant' %} -<#bot#> -{{ message['content']|trim -}} -{% if not loop.last %} - -{% endif %} -{% elif message['role'] == 'user_context' %} -<#user_context#> -{{ message['content']|trim -}} -{% if not loop.last %} - -{% endif %} -{% endif %} -{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} -<#bot#> -{% endif %} \ No newline at end of file diff --git a/format.sh b/format.sh deleted file mode 100755 index c78108869659d34644108f2c5464da368cd57772..0000000000000000000000000000000000000000 --- a/format.sh +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env bash -# YAPF formatter, adapted from ray and skypilot. -# -# Usage: -# # Do work and commit your work. - -# # Format files that differ from origin/main. -# bash format.sh - -# # Commit changed files with message 'Run yapf and ruff' -# -# -# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. -# You are encouraged to run this locally before pushing changes for review. - -# Cause the script to exit if a single command fails -set -eo pipefail - -# this stops git rev-parse from failing if we run this from the .git directory -builtin cd "$(dirname "${BASH_SOURCE:-$0}")" -ROOT="$(git rev-parse --show-toplevel)" -builtin cd "$ROOT" || exit 1 - -YAPF_VERSION=$(yapf --version | awk '{print $2}') -RUFF_VERSION=$(ruff --version | awk '{print $2}') -MYPY_VERSION=$(mypy --version | awk '{print $2}') - -# # params: tool name, tool version, required version -tool_version_check() { - if [[ $2 != $3 ]]; then - echo "Wrong $1 version installed: $3 is required, not $2." - exit 1 - fi -} - -tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)" -tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)" -tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" - -YAPF_FLAGS=( - '--recursive' - '--parallel' -) - -YAPF_EXCLUDES=( - '--exclude' 'build/**' -) - -# Format specified files -format() { - yapf --in-place "${YAPF_FLAGS[@]}" "$@" -} - -# Format files that differ from main branch. Ignores dirs that are not slated -# for autoformat yet. -format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause yapf to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" - fi - -} - -# Format all files -format_all() { - yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" . -} - -## This flag formats individual files. --files *must* be the first command line -## arg to use this option. 
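# Example: bash format.sh --files vllm/engine/llm_engine.py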
-if [[ "$1" == '--files' ]]; then - format "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is formatted. -elif [[ "$1" == '--all' ]]; then - format_all -else - # Format only the files that changed in last commit. - format_changed -fi -echo 'vLLM yapf: Done' - -# Run mypy -# TODO(zhuohan): Enable mypy -# echo 'vLLM mypy:' -# mypy - -# Lint specified files -lint() { - ruff "$@" -} - -# Lint files that differ from main branch. Ignores dirs that are not slated -# for autolint yet. -lint_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - ruff - fi - -} - -# Run Ruff -echo 'vLLM Ruff:' -## This flag lints individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - lint "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - lint vllm tests -else - # Format only the files that changed in last commit. - lint_changed -fi - -if ! git diff --quiet &>/dev/null; then - echo 'Reformatted files. Please review and stage the changes.' - echo 'Changes not staged for commit:' - echo - git --no-pager diff --name-only - - exit 1 -fi diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 55c4248ea9d26f437e51066b1febccb439a64bcc..0000000000000000000000000000000000000000 --- a/mypy.ini +++ /dev/null @@ -1,8 +0,0 @@ -[mypy] -python_version = 3.8 - -ignore_missing_imports = True - -files = vllm -# TODO(woosuk): Include the code from Megatron and HuggingFace. -exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/ diff --git a/patch_xformers.rocm.sh b/patch_xformers.rocm.sh deleted file mode 100644 index de427b24d306f759c9c484dc6a70d03a107d4bcc..0000000000000000000000000000000000000000 --- a/patch_xformers.rocm.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -set -e - -XFORMERS_VERSION="0.0.23" - -export XFORMERS_INSTALLED_VERSION=$(python -c 'import xformers; print(xformers.__version__)') - -if [ "$XFORMERS_INSTALLED_VERSION" != "$XFORMERS_VERSION" ]; then - echo "ERROR: xformers version must be ${XFORMERS_VERSION}. ${XFORMERS_INSTALLED_VERSION} is installed" - exit 1 -fi - -export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)') -export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)') - -echo "XFORMERS_FMHA_FLASH_PATH = ${XFORMERS_FMHA_FLASH_PATH}" -echo "XFORMERS_FMHA_COMMON_PATH = ${XFORMERS_FMHA_COMMON_PATH}" - -if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then - echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}" - patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch" - echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}" -else - echo "${XFORMERS_FMHA_FLASH_PATH} was patched before" -fi - -if ! 
patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then - echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}" - patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch" - echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}" -else - echo "${XFORMERS_FMHA_COMMON_PATH} was patched before" -fi diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index b197256f6ff55b1792e04386699cf96bbd505466..0000000000000000000000000000000000000000 --- a/pyproject.toml +++ /dev/null @@ -1,34 +0,0 @@ -[build-system] -# Should be mirrored in requirements-build.txt -requires = [ - "ninja", - "packaging", - "setuptools >= 49.4.0", - "torch == 2.1.2", - "wheel", -] -build-backend = "setuptools.build_meta" - -[tool.ruff.lint] -select = [ - # pycodestyle - "E", - # Pyflakes - "F", - # pyupgrade - # "UP", - # flake8-bugbear - "B", - # flake8-simplify - "SIM", - # isort - # "I", -] -ignore = [ - # star imports - "F405", "F403", - # lambda expression assignment - "E731", - # line too long, handled by black formatting - "E501", -] diff --git a/requirements-build.txt b/requirements-build.txt deleted file mode 100644 index 7e7e48a1313e50b261f1bfa533d8f3e72729b744..0000000000000000000000000000000000000000 --- a/requirements-build.txt +++ /dev/null @@ -1,6 +0,0 @@ -# Should be mirrored in pyproject.toml -ninja -packaging -setuptools>=49.4.0 -torch==2.1.2 -wheel \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index f8126008d07942b46f76b45c791fbf5058e248a8..0000000000000000000000000000000000000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,21 +0,0 @@ -# formatting -yapf==0.32.0 -toml==0.10.2 -ruff==0.1.5 - -# type checking -mypy==0.991 -types-PyYAML -types-requests -types-setuptools - -# testing -pytest -pytest-forked -pytest-asyncio -httpx -einops # required for MPT -flash_attn # required for HuggingFace's llama implementation -openai -requests -ray \ No newline at end of file diff --git a/requirements-neuron.txt b/requirements-neuron.txt deleted file mode 100644 index 3f30ed08f037de1d96d9c5aff8fcbbf7e315eeb3..0000000000000000000000000000000000000000 --- a/requirements-neuron.txt +++ /dev/null @@ -1,9 +0,0 @@ -sentencepiece # Required for LLaMA tokenizer. -numpy -transformers-neuronx >= 0.9.0 -torch-neuronx >= 2.1.0 -neuronx-cc -fastapi -uvicorn[standard] -pydantic >= 2.0 # Required for OpenAI server. -aioprometheus[starlette] diff --git a/requirements-rocm.txt b/requirements-rocm.txt deleted file mode 100644 index 7b42ee067310bde063efd1e02ebe43543286eeae..0000000000000000000000000000000000000000 --- a/requirements-rocm.txt +++ /dev/null @@ -1,13 +0,0 @@ -ninja # For faster builds. -typing-extensions>=4.8.0 -starlette -psutil -ray >= 2.9 -sentencepiece # Required for LLaMA tokenizer. -numpy -tokenizers>=0.15.0 -transformers >= 4.37.0 # Required for Mixtral. -fastapi -uvicorn[standard] -pydantic >= 2.0 # Required for OpenAI server. -aioprometheus[starlette] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 2bf527ccc3a777678daabedbf0f1189071cca71c..0000000000000000000000000000000000000000 --- a/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -ninja # For faster builds. -psutil -ray >= 2.9 -sentencepiece # Required for LLaMA tokenizer. -numpy -torch == 2.1.2 -transformers >= 4.37.0 # Required for Qwen2 -xformers == 0.0.23.post1 # Required for CUDA 12.1. 
-fastapi -uvicorn[standard] -pydantic >= 2.0 # Required for OpenAI server. -aioprometheus[starlette] -pynvml == 11.5.0 diff --git a/rocm_patch/commonpy_xformers-0.0.23.rocm.patch b/rocm_patch/commonpy_xformers-0.0.23.rocm.patch deleted file mode 100644 index 4d7495cf13e1d3a64507a64c1835d751b2df56ac..0000000000000000000000000000000000000000 --- a/rocm_patch/commonpy_xformers-0.0.23.rocm.patch +++ /dev/null @@ -1,13 +0,0 @@ ---- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/common.py 2023-11-29 03:17:03.930103539 +0000 -+++ common.py 2023-11-28 16:14:19.846233146 +0000 -@@ -298,8 +298,8 @@ - dtype = d.query.dtype - if device_type not in cls.SUPPORTED_DEVICES: - reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})") -- if device_type == "cuda" and not _built_with_cuda: -- reasons.append("xFormers wasn't build with CUDA support") -+ #if device_type == "cuda" and not _built_with_cuda: -+ # reasons.append("xFormers wasn't build with CUDA support") - if device_type == "cuda": - device_capability = torch.cuda.get_device_capability(d.device) - if device_capability < cls.CUDA_MINIMUM_COMPUTE_CAPABILITY: diff --git a/rocm_patch/flashpy_xformers-0.0.23.rocm.patch b/rocm_patch/flashpy_xformers-0.0.23.rocm.patch deleted file mode 100644 index ac846728a7a918f58da761f1f8b56709881b2713..0000000000000000000000000000000000000000 --- a/rocm_patch/flashpy_xformers-0.0.23.rocm.patch +++ /dev/null @@ -1,152 +0,0 @@ ---- flash_ori.py 2023-12-13 05:43:31.530752623 +0000 -+++ flash_patch.py 2023-12-13 06:00:45.962403104 +0000 -@@ -36,44 +36,44 @@ - - FLASH_VERSION = "0.0.0" - try: -- try: -- from ... import _C_flashattention # type: ignore[attr-defined] -- from ..._cpp_lib import _build_metadata -- -- if _build_metadata is not None: -- FLASH_VERSION = _build_metadata.flash_version -- except ImportError: -- import flash_attn -- from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention -- -- FLASH_VERSION = flash_attn.__version__ -- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3]) -- if ( -- flash_ver_parsed != (2, 3, 6) -- and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1" -- ): -- raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api") -+ #try: -+ # from ... import _C_flashattention # type: ignore[attr-defined] -+ # from ..._cpp_lib import _build_metadata -+ -+ # if _build_metadata is not None: -+ # FLASH_VERSION = _build_metadata.flash_version -+ #except ImportError: -+ import flash_attn -+ from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention -+ -+ FLASH_VERSION = flash_attn.__version__ -+ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3]) -+ # if ( -+ # flash_ver_parsed != (2, 3, 6) -+ # and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1" -+ # ): -+ # raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api") - - # create library so that flash-attn goes through the PyTorch Dispatcher -- _flash_lib = torch.library.Library("xformers_flash", "DEF") -- -- _flash_lib.define( -- "flash_fwd(Tensor query, Tensor key, Tensor value, " -- "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? 
seqused_k, " -- "int max_seqlen_q, int max_seqlen_k, " -- "float p, float softmax_scale, " -- "bool is_causal, int window_left, " -- "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)" -- ) -+ #_flash_lib = torch.library.Library("xformers_flash", "DEF") - -- _flash_lib.define( -- "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, " -- "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, " -- "Tensor cu_seqlens_q, Tensor cu_seqlens_k, " -- "int max_seqlen_q, int max_seqlen_k, " -- "float p, float softmax_scale, bool is_causal, " -- "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)" -- ) -+ #_flash_lib.define( -+ # "flash_fwd(Tensor query, Tensor key, Tensor value, " -+ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, " -+ # "int max_seqlen_q, int max_seqlen_k, " -+ # "float p, float softmax_scale, " -+ # "bool is_causal, int window_left, " -+ # "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)" -+ #) -+ -+ #_flash_lib.define( -+ # "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, " -+ # "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, " -+ # "Tensor cu_seqlens_q, Tensor cu_seqlens_k, " -+ # "int max_seqlen_q, int max_seqlen_k, " -+ # "float p, float softmax_scale, bool is_causal, " -+ # "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)" -+ #) - - def _flash_fwd( - query, -@@ -111,8 +111,8 @@ - p, - softmax_scale, - is_causal, -- window_left, # window_size_left -- window_right, # window_size_right -+ # window_left, # window_size_left -+ # window_right, # window_size_right - return_softmax, - None, # rng - ) -@@ -134,15 +134,15 @@ - out, - cu_seq_lens_q, - cu_seq_lens_k, -- seqused_k, -+ # seqused_k, - max_seq_len_q, - max_seq_len_k, - p, - softmax_scale, - False, - is_causal, -- window_left, -- window_right, -+ # window_left, -+ # window_right, - return_softmax, - None, - ) -@@ -184,8 +184,8 @@ - p, - softmax_scale, - is_causal, -- window_left, -- window_right, -+ # window_left, -+ # window_right, - None, - rng_state, - ) -@@ -208,15 +208,15 @@ - softmax_scale, - False, # zero_tensors - is_causal, -- window_left, -- window_right, -+ # window_left, -+ # window_right, - None, - rng_state, - ) - return dq, dk, dv - -- _flash_lib.impl("flash_fwd", _flash_fwd, "CUDA") -- _flash_lib.impl("flash_bwd", _flash_bwd, "CUDA") -+ #_flash_lib.impl("flash_fwd", _flash_fwd, "CUDA") -+ #_flash_lib.impl("flash_bwd", _flash_bwd, "CUDA") - except ImportError: - pass - -@@ -400,7 +400,7 @@ - implementation. 
- """ - -- OPERATOR = get_operator("xformers_flash", "flash_fwd") -+ OPERATOR = _flash_fwd # get_operator("xformers_flash", "flash_fwd") - SUPPORTED_DEVICES: Set[str] = {"cuda"} - CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0) - SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16} diff --git a/server.csr b/server.csr new file mode 100644 index 0000000000000000000000000000000000000000..38dc2c102937c0693cba4b9235e9d748f6375b44 --- /dev/null +++ b/server.csr @@ -0,0 +1,18 @@ +-----BEGIN CERTIFICATE REQUEST----- +MIICzzCCAbcCAQAwgYkxCzAJBgNVBAYTAlVTMRcwFQYDVQQIDA5Tb3V0aCBDYXJv +bGluYTETMBEGA1UEBwwKQ2hhcmxlc3RvbjERMA8GA1UECgwIUGVyc29uYWwxFTAT +BgNVBAMMDEFQSSBFbmRwb2ludDEiMCAGCSqGSIb3DQEJARYTYnNtaXQxNjU5QGdt +YWlsLmNvbTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBALvBXOWhiijs +Cwr9RuhBLv9zOu3U06j+PDBooKavhzKkKmUneOsufQdSfVLpul8zToWk9htngD2Z +zfyTsRjTg6wYMFXh//bxpqp2T5bYjH99lrhukCiN5grDmSSBjT4RFQUpt72MT6Uh +Dm+I17F/f7zwPG4l795CQyUaDL6yQzZmQIGrwKDsFkyUJXie6HR6rNRXakJwTU4w +asrNrVx94BuaCWWnNNfTmFwCrRGCO8mmSXU/hGTL7e5Po82JCjFdvVpw5ZSkP0oe +aS+KnAEYPkVqWUGP2Q++pA6DXe6OkVflGJBBL4sftIxoCMBHpSen/NbNS9/tPBaa +fYvqZTHMkkMCAwEAAaAAMA0GCSqGSIb3DQEBCwUAA4IBAQA7CaqC3ydIaSnVebz7 +ciDv6RVgYJ5gxDp+BRk5Cbfoevs+ChP8zFOStsGL8jbk1VrNtGJZzC0FCHbMtuY3 +lEg1+ZMpi8P2SMkMPjCSc5PbxB0Xp4y5ACyqVb/UMSI0r3ANbSirTkxysqxFSYd7 ++xD644OVpCdVDC2JMp21P0EV/Qq8MZg8ZiUlO/FWW1+1OsHQ+gqG9vMofoNxbsM2 +uKm7J1vmK/4n4raXb9BtSWoPUxVgag8giv2zkaV12ZKWsDP2wQzGi5qjrsJ62Fpt +YuPibZceaRufo4ntvYPYc5bgIfkXbJSJ1Fp9xoiZG49sU2dJhGJuWFkdlosmUBCz +xSoW +-----END CERTIFICATE REQUEST----- diff --git a/server.key b/server.key new file mode 100644 index 0000000000000000000000000000000000000000..4e8b2aff3198f6de22cdde9b1ca2fd7979f5c99c --- /dev/null +++ b/server.key @@ -0,0 +1,30 @@ +-----BEGIN RSA PRIVATE KEY----- +Proc-Type: 4,ENCRYPTED +DEK-Info: DES-EDE3-CBC,C7A79FD3DBB72768 + +cnLPOETFSBIO+bkqlHyINtceS9JYnqB2kNHHZv/hgMad76jaVH7g3IcUPBuA/5jP +jxILJy+2n1YYm/EWy6tG1CznScIs0CgixmAwC8+ql9OQSAa/3KqECcLSKTUso/VN +b9rINVgXtxyUmSA32rS8mqMW75U7grB0SpKvFoPn0EDPff+ohjFmApAPW1XPdKl+ +rFwxhoMGFiR2p1mEjAmA6rXuKy+rwyQ4WjjNZQvPQZJsUMDF7SCy4xMIKUwns5op +6zc+DrgurovkZthXsbnvMd23onlEQtjFw0p25Ri4sg3YB/qbShGvouydoCcXRFKI +R5fjW1J+jU6g6sQSzm68ss0pwqYc1WYfKzhDbkrgjLN+0t+pYyxO3N2VmOSoweFl +TyH60yodjzBFDWWjxEMk7EjvhOcoIM+1pk2sXtsynMpjEFmKsZ0S3KB2x+0tvHMB +Qco2tYe/4VwkF7kIr7NHuVxFgIHLefTFgpER2yvOXrumSsjn6+bnwNjk9beasL9/ +FghW6Vuo/8I6i+RyNLJBUE+ngsTuOu9ueTPqEex1b7hj9N7dLvZsBYTq3PdAYiB0 +/e0zE07dzwv0lANkUYQEAqPF7LBek9Ff+pxB3yglIvVbW6QZ4Pv0gwfJzx9FQafR +LOwnmJad3xDPc2OHFbyNgpYq7mI9EGlfSEU+S0l0qClV0Zj9iL6H4whc36/FRNnm +3Zu8yzjEkhvS7yQqgOqIZ4z2/B/w5ucuxtSWt0iI738rUGXFlXFmTEyYyL0MgmWk +WvoF16BcyHOZO8UjRxtucTYe0kP93ZpVp09jkG0cow56hrRAuBlxauvZo1nlCSDO +MWqHCxEdWZOfQ14tosXqxkmHWf92QTNUZmatWFgeeqykfAWaIO08xaHZttrjK5kw +3oRe/uvO4raca6FzLbARxYy24T5dVxHm0STwdRIsiAZ1wz3Z49ZWfekKdXk/fbBK +DCn1yNxPM+KNdtS2tHKrSkpCSsUIzDBGViZPiSoH0imxfVkALqQ7jKvSTsn0jSW5 +2EQn52OzURGq0IuD0Y/uIAMeblnSTsIi/exNfbSx9ryaL8T5GSUVCg6WIkJHgwXq +CCQMGsb76XEMoko8ShE34kLWg9MePDU0xg/esDlT7qcGmeXrijjaWmmV/L+1SMSZ +2pGhj2fyBSTbR2pdkjJE3O+5y+3jMjrDWjVWMWCzHMA7Ctgrpi70wiYEW07TPEno +LtvwiOuYdjBN7coO6cdSBQkh6ZIeqyvj9/z2mNJ8T7dKMNA0cI/H46MUwmYexe61 +3iB5SbacJ8XsBEPXpUp2tmLgNEFOeYlQ9DfrH5uJ5mRPm++wgMR27gO8o3m+bMwD +y3adTZQaBQo4vuYH/Zq6JKLbMAmBG1k5xyh5i9+z7YFOAXmBQMLHMPJXVXAY8iRk +JKbaWufXHR+qnUxTfz3Cx09FayUFAbjyIehsV21o938tYkQ4rTwKSPKGPl2+ppQr +r6aa+FOJTw6jIGutK+NFBCUeKf75h/jMBsQgvFDa/vuaS5vh3AqaSK/C+JvnTpCJ +XmLN4WLWP+jSm9sTJQOzMb5aQrsopNx9nqrVVfA/Z3p+Rt6UNN6MeQ== +-----END RSA PRIVATE KEY----- diff --git a/setup.py b/setup.py 
deleted file mode 100644 index 3e2127855a7550e1ae97dd28ce65902a62ebbcf0..0000000000000000000000000000000000000000 --- a/setup.py +++ /dev/null @@ -1,431 +0,0 @@ -import contextlib -import io -import os -import re -import subprocess -import warnings -from pathlib import Path -from typing import List, Set - -from packaging.version import parse, Version -import setuptools -import torch -import torch.utils.cpp_extension as torch_cpp_ext -from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME - -ROOT_DIR = os.path.dirname(__file__) - -MAIN_CUDA_VERSION = "12.1" - -# Supported NVIDIA GPU architectures. -NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"} -# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) - - -def _is_hip() -> bool: - return torch.version.hip is not None - - -def _is_neuron() -> bool: - torch_neuronx_installed = True - try: - subprocess.run(["neuron-ls"], capture_output=True, check=True) - except FileNotFoundError: - torch_neuronx_installed = False - return torch_neuronx_installed - - -def _is_cuda() -> bool: - return (torch.version.cuda is not None) and not _is_neuron() - - -# Compiler flags. -CXX_FLAGS = ["-g", "-O2", "-std=c++17"] -# TODO(woosuk): Should we use -O3? -NVCC_FLAGS = ["-O2", "-std=c++17"] - -if _is_hip(): - if ROCM_HOME is None: - raise RuntimeError( - "Cannot find ROCM_HOME. ROCm must be available to build the package." - ) - NVCC_FLAGS += ["-DUSE_ROCM"] - NVCC_FLAGS += ["-U__HIP_NO_HALF_CONVERSIONS__"] - NVCC_FLAGS += ["-U__HIP_NO_HALF_OPERATORS__"] - -if _is_cuda() and CUDA_HOME is None: - raise RuntimeError( - "Cannot find CUDA_HOME. CUDA must be available to build the package.") - -ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 -CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] - - -def get_amdgpu_offload_arch(): - command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" - try: - output = subprocess.check_output([command]) - return output.decode('utf-8').strip() - except subprocess.CalledProcessError as e: - error_message = f"Error: {e}" - raise RuntimeError(error_message) from e - except FileNotFoundError as e: - # If the command is not found, print an error message - error_message = f"The command {command} was not found." 
- raise RuntimeError(error_message) from e - - return None - - -def get_hipcc_rocm_version(): - # Run the hipcc --version command - result = subprocess.run(['hipcc', '--version'], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True) - - # Check if the command was executed successfully - if result.returncode != 0: - print("Error running 'hipcc --version'") - return None - - # Extract the version using a regular expression - match = re.search(r'HIP version: (\S+)', result.stdout) - if match: - # Return the version string - return match.group(1) - else: - print("Could not find HIP version in the output") - return None - - -def glob(pattern: str): - root = Path(__name__).parent - return [str(p) for p in root.glob(pattern)] - - -def get_neuronxcc_version(): - import sysconfig - site_dir = sysconfig.get_paths()["purelib"] - version_file = os.path.join(site_dir, "neuronxcc", "version", - "__init__.py") - - # Read the version file from the installed neuronxcc package - with open(version_file, "rt") as fp: - content = fp.read() - - # Extract the version using a regular expression - match = re.search(r"__version__ = '(\S+)'", content) - if match: - # Return the version string - return match.group(1) - else: - raise RuntimeError("Could not find neuronxcc version in the version file") - - -def get_nvcc_cuda_version(cuda_dir: str) -> Version: - """Get the CUDA version from nvcc. - - Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py - """ - nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], - universal_newlines=True) - output = nvcc_output.split() - release_idx = output.index("release") + 1 - nvcc_cuda_version = parse(output[release_idx].split(",")[0]) - return nvcc_cuda_version - - -def get_torch_arch_list() -> Set[str]: - # TORCH_CUDA_ARCH_LIST can have one or more architectures, - # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the - # compiler to additionally include PTX code that can be runtime-compiled - # and executed on the 8.6 or newer architectures. While the PTX code will - # not give the best performance on the newer architectures, it provides - # forward compatibility. - env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) - if env_arch_list is None: - return set() - - # List items are separated by ; or space. - torch_arch_list = set(env_arch_list.replace(" ", ";").split(";")) - if not torch_arch_list: - return set() - - # Filter out the invalid architectures and print a warning. - valid_archs = NVIDIA_SUPPORTED_ARCHS.union( - {s + "+PTX" - for s in NVIDIA_SUPPORTED_ARCHS}) - arch_list = torch_arch_list.intersection(valid_archs) - # If none of the specified architectures are valid, raise an error. - if not arch_list: - raise RuntimeError( - "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " - f"variable ({env_arch_list}) is supported. " - f"Supported CUDA/ROCM architectures are: {valid_archs}.") - invalid_arch_list = torch_arch_list - valid_archs - if invalid_arch_list: - warnings.warn( - f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " - "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " - f"({env_arch_list}). Supported CUDA/ROCM architectures are: " - f"{valid_archs}.", - stacklevel=2) - return arch_list - - -# First, check the TORCH_CUDA_ARCH_LIST environment variable. -compute_capabilities = get_torch_arch_list() -if _is_cuda() and not compute_capabilities: - # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available - # GPUs on the current machine. 
- device_count = torch.cuda.device_count() - for i in range(device_count): - major, minor = torch.cuda.get_device_capability(i) - if major < 7: - raise RuntimeError( - "GPUs with compute capability below 7.0 are not supported.") - compute_capabilities.add(f"{major}.{minor}") - -ext_modules = [] - -if _is_cuda(): - nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) - if not compute_capabilities: - # If no GPU is specified nor available, add all supported architectures - # based on the NVCC CUDA version. - compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() - if nvcc_cuda_version < Version("11.1"): - compute_capabilities.remove("8.6") - if nvcc_cuda_version < Version("11.8"): - compute_capabilities.remove("8.9") - compute_capabilities.remove("9.0") - # Validate the NVCC CUDA version. - if nvcc_cuda_version < Version("11.0"): - raise RuntimeError( - "CUDA 11.0 or higher is required to build the package.") - if (nvcc_cuda_version < Version("11.1") - and any(cc.startswith("8.6") for cc in compute_capabilities)): - raise RuntimeError( - "CUDA 11.1 or higher is required for compute capability 8.6.") - if nvcc_cuda_version < Version("11.8"): - if any(cc.startswith("8.9") for cc in compute_capabilities): - # CUDA 11.8 is required to generate the code targeting compute capability 8.9. - # However, GPUs with compute capability 8.9 can also run the code generated by - # the previous versions of CUDA 11 and targeting compute capability 8.0. - # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0 - # instead of 8.9. - warnings.warn( - "CUDA 11.8 or higher is required for compute capability 8.9. " - "Targeting compute capability 8.0 instead.", - stacklevel=2) - compute_capabilities = set(cc for cc in compute_capabilities - if not cc.startswith("8.9")) - compute_capabilities.add("8.0+PTX") - if any(cc.startswith("9.0") for cc in compute_capabilities): - raise RuntimeError( - "CUDA 11.8 or higher is required for compute capability 9.0.") - - NVCC_FLAGS_PUNICA = NVCC_FLAGS.copy() - - # Add target compute capabilities to NVCC flags. - for capability in compute_capabilities: - num = capability[0] + capability[2] - NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] - if capability.endswith("+PTX"): - NVCC_FLAGS += [ - "-gencode", f"arch=compute_{num},code=compute_{num}" - ] - if int(capability[0]) >= 8: - NVCC_FLAGS_PUNICA += [ - "-gencode", f"arch=compute_{num},code=sm_{num}" - ] - if capability.endswith("+PTX"): - NVCC_FLAGS_PUNICA += [ - "-gencode", f"arch=compute_{num},code=compute_{num}" - ] - - # Use NVCC threads to parallelize the build. 
- if nvcc_cuda_version >= Version("11.2"): - nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) - num_threads = min(os.cpu_count(), nvcc_threads) - NVCC_FLAGS += ["--threads", str(num_threads)] - - if nvcc_cuda_version >= Version("11.8"): - NVCC_FLAGS += ["-DENABLE_FP8_E5M2"] - - # changes for punica kernels - NVCC_FLAGS += torch_cpp_ext.COMMON_NVCC_FLAGS - REMOVE_NVCC_FLAGS = [ - '-D__CUDA_NO_HALF_OPERATORS__', - '-D__CUDA_NO_HALF_CONVERSIONS__', - '-D__CUDA_NO_BFLOAT16_CONVERSIONS__', - '-D__CUDA_NO_HALF2_OPERATORS__', - ] - for flag in REMOVE_NVCC_FLAGS: - with contextlib.suppress(ValueError): - torch_cpp_ext.COMMON_NVCC_FLAGS.remove(flag) - - install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) - device_count = torch.cuda.device_count() - for i in range(device_count): - major, minor = torch.cuda.get_device_capability(i) - if major < 8: - install_punica = False - break - if install_punica: - ext_modules.append( - CUDAExtension( - name="vllm._punica_C", - sources=["csrc/punica/punica_ops.cc"] + - glob("csrc/punica/bgmv/*.cu"), - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS_PUNICA, - }, - )) -elif _is_hip(): - amd_archs = os.getenv("GPU_ARCHS") - if amd_archs is None: - amd_archs = get_amdgpu_offload_arch() - for arch in amd_archs.split(";"): - if arch not in ROCM_SUPPORTED_ARCHS: - raise RuntimeError( - f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" - f"amdgpu_arch_found: {arch}") - NVCC_FLAGS += [f"--offload-arch={arch}"] - -elif _is_neuron(): - neuronxcc_version = get_neuronxcc_version() - -vllm_extension_sources = [ - "csrc/cache_kernels.cu", - "csrc/attention/attention_kernels.cu", - "csrc/pos_encoding_kernels.cu", - "csrc/activation_kernels.cu", - "csrc/layernorm_kernels.cu", - "csrc/quantization/squeezellm/quant_cuda_kernel.cu", - "csrc/quantization/gptq/q_gemm.cu", - "csrc/cuda_utils_kernels.cu", - "csrc/moe_align_block_size_kernels.cu", - "csrc/pybind.cpp", -] - -if _is_cuda(): - vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") - vllm_extension_sources.append("csrc/custom_all_reduce.cu") - -if not _is_neuron(): - vllm_extension = CUDAExtension( - name="vllm._C", - sources=vllm_extension_sources, - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS, - }, - libraries=["cuda"] if _is_cuda() else [], - ) - ext_modules.append(vllm_extension) - - -def get_path(*filepath) -> str: - return os.path.join(ROOT_DIR, *filepath) - - -def find_version(filepath: str) -> str: - """Extract version information from the given filepath. 
- - Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py - """ - with open(filepath) as fp: - version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", - fp.read(), re.M) - if version_match: - return version_match.group(1) - raise RuntimeError("Unable to find version string.") - - -def get_vllm_version() -> str: - version = find_version(get_path("vllm", "__init__.py")) - - if _is_hip(): - # Get the HIP version - hipcc_version = get_hipcc_rocm_version() - if hipcc_version != MAIN_CUDA_VERSION: - rocm_version_str = hipcc_version.replace(".", "")[:3] - version += f"+rocm{rocm_version_str}" - elif _is_neuron(): - # Get the Neuron version - neuron_version = str(neuronxcc_version) - if neuron_version != MAIN_CUDA_VERSION: - neuron_version_str = neuron_version.replace(".", "")[:3] - version += f"+neuron{neuron_version_str}" - else: - cuda_version = str(nvcc_cuda_version) - if cuda_version != MAIN_CUDA_VERSION: - cuda_version_str = cuda_version.replace(".", "")[:3] - version += f"+cu{cuda_version_str}" - - return version - - -def read_readme() -> str: - """Read the README file if present.""" - p = get_path("README.md") - if os.path.isfile(p): - return io.open(get_path("README.md"), "r", encoding="utf-8").read() - else: - return "" - - -def get_requirements() -> List[str]: - """Get Python package dependencies from requirements.txt.""" - if _is_hip(): - with open(get_path("requirements-rocm.txt")) as f: - requirements = f.read().strip().split("\n") - elif _is_neuron(): - with open(get_path("requirements-neuron.txt")) as f: - requirements = f.read().strip().split("\n") - else: - with open(get_path("requirements.txt")) as f: - requirements = f.read().strip().split("\n") - return requirements - - -package_data = {"vllm": ["py.typed"]} -if os.environ.get("VLLM_USE_PRECOMPILED"): - ext_modules = [] - package_data["vllm"].append("*.so") - -setuptools.setup( - name="vllm", - version=get_vllm_version(), - author="vLLM Team", - license="Apache 2.0", - description=("A high-throughput and memory-efficient inference and " - "serving engine for LLMs"), - long_description=read_readme(), - long_description_content_type="text/markdown", - url="https://github.com/vllm-project/vllm", - project_urls={ - "Homepage": "https://github.com/vllm-project/vllm", - "Documentation": "https://vllm.readthedocs.io/en/latest/", - }, - classifiers=[ - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "License :: OSI Approved :: Apache Software License", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - ], - packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs", - "examples", "tests")), - python_requires=">=3.8", - install_requires=get_requirements(), - ext_modules=ext_modules, - cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {}, - package_data=package_data, -) diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py deleted file mode 100644 index 1be76fdc8d868e288f218c1fb015665e8d78aa13..0000000000000000000000000000000000000000 --- a/tests/async_engine/api_server_async_engine.py +++ /dev/null @@ -1,50 +0,0 @@ -"""vllm.entrypoints.api_server with some extra logging for testing.""" -import 
argparse -from typing import Any, Dict - -import uvicorn -from fastapi.responses import JSONResponse, Response - -import vllm.entrypoints.api_server -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine - -app = vllm.entrypoints.api_server.app - - -class AsyncLLMEngineWithStats(AsyncLLMEngine): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._num_aborts = 0 - - async def abort(self, request_id: str) -> None: - await super().abort(request_id) - self._num_aborts += 1 - - def testing_stats(self) -> Dict[str, Any]: - return {"num_aborted_requests": self._num_aborts} - - -@app.get("/stats") -def stats() -> Response: - """Get the statistics of the engine.""" - return JSONResponse(engine.testing_stats()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser = AsyncEngineArgs.add_cli_args(parser) - args = parser.parse_args() - - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) - vllm.entrypoints.api_server.engine = engine - uvicorn.run( - app, - host=args.host, - port=args.port, - log_level="debug", - timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py deleted file mode 100644 index ed9017c1e3e9dee1a87d0c71ecd466524522eb38..0000000000000000000000000000000000000000 --- a/tests/async_engine/test_api_server.py +++ /dev/null @@ -1,101 +0,0 @@ -import subprocess -import sys -import time -from multiprocessing import Pool -from pathlib import Path - -import pytest -import requests - - -def _query_server(prompt: str, max_tokens: int = 5) -> dict: - response = requests.post("http://localhost:8000/generate", - json={ - "prompt": prompt, - "max_tokens": max_tokens, - "temperature": 0, - "ignore_eos": True - }) - response.raise_for_status() - return response.json() - - -def _query_server_long(prompt: str) -> dict: - return _query_server(prompt, max_tokens=500) - - -@pytest.fixture -def api_server(): - script_path = Path(__file__).parent.joinpath( - "api_server_async_engine.py").absolute() - uvicorn_process = subprocess.Popen([ - sys.executable, - "-u", - str(script_path), - "--model", - "facebook/opt-125m", - "--host", - "127.0.0.1", - ]) - yield - uvicorn_process.terminate() - - -def test_api_server(api_server): - """ - Run the API server and test it. - - We run both the server and requests in separate processes. - - We test that the server can handle incoming requests, including - multiple requests at the same time, and that it can handle requests - being cancelled without crashing. 
- """ - with Pool(32) as pool: - # Wait until the server is ready - prompts = ["warm up"] * 1 - result = None - while not result: - try: - for r in pool.map(_query_server, prompts): - result = r - break - except requests.exceptions.ConnectionError: - time.sleep(1) - - # Actual tests start here - # Try with 1 prompt - for result in pool.map(_query_server, prompts): - assert result - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests == 0 - - # Try with 100 prompts - prompts = ["test prompt"] * 100 - for result in pool.map(_query_server, prompts): - assert result - - with Pool(32) as pool: - # Cancel requests - prompts = ["canceled requests"] * 100 - pool.map_async(_query_server_long, prompts) - time.sleep(0.01) - pool.terminate() - pool.join() - - # check cancellation stats - # give it some times to update the stats - time.sleep(1) - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests > 0 - - # check that server still runs after cancellations - with Pool(32) as pool: - # Try with 100 prompts - prompts = ["test prompt after canceled"] * 100 - for result in pool.map(_query_server, prompts): - assert result diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py deleted file mode 100644 index 1edb19c550010c93784b5a6e3298fc147fd9947b..0000000000000000000000000000000000000000 --- a/tests/async_engine/test_async_llm_engine.py +++ /dev/null @@ -1,91 +0,0 @@ -import asyncio -from dataclasses import dataclass - -import pytest - -from vllm.engine.async_llm_engine import AsyncLLMEngine - - -@dataclass -class RequestOutput: - request_id: int - finished: bool = False - - -class MockEngine: - - def __init__(self): - self.step_calls = 0 - self.add_request_calls = 0 - self.abort_request_calls = 0 - self.request_id = None - - async def step_async(self): - self.step_calls += 1 - return [RequestOutput( - request_id=self.request_id)] if self.request_id else [] - - async def encode_request_async( - self, - *args, - **kwargs, - ): - return [1] - - def generate(self, request_id): - self.request_id = request_id - - def stop_generating(self): - self.request_id = None - - def add_request(self, **kwargs): - del kwargs # Unused - self.add_request_calls += 1 - - async def add_request_async(self, **kwargs): - del kwargs # Unused - self.add_request_calls += 1 - - def abort_request(self, request_id): - del request_id # Unused - self.abort_request_calls += 1 - - -class MockAsyncLLMEngine(AsyncLLMEngine): - - def _init_engine(self, *args, **kwargs): - return MockEngine() - - -@pytest.mark.asyncio -async def test_new_requests_event(): - engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False) - engine.start_background_loop() - await asyncio.sleep(0.01) - assert engine.engine.step_calls == 0 - - await engine.add_request("1", "", None) - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 1 - assert engine.engine.step_calls == 1 - - await engine.add_request("2", "", None) - engine.engine.generate("2") - await asyncio.sleep(0) - assert engine.engine.add_request_calls == 2 - assert engine.engine.step_calls == 2 - await asyncio.sleep(0) - assert engine.engine.step_calls == 3 - engine.engine.stop_generating() - await asyncio.sleep(0) - assert engine.engine.step_calls == 4 - await asyncio.sleep(0) - assert engine.engine.step_calls == 4 - - await engine.add_request("3", "", None) - await 
asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == 5 - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == 5 diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py deleted file mode 100644 index 32d110e0f0b47b198ccb0926562956a677d07a31..0000000000000000000000000000000000000000 --- a/tests/async_engine/test_chat_template.py +++ /dev/null @@ -1,120 +0,0 @@ -from dataclasses import dataclass -import os -import pathlib - -import pytest - -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.protocol import ChatCompletionRequest - -chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath( - __file__))).parent.parent / "examples/template_chatml.jinja" -assert chatml_jinja_path.exists() - -# Define models, templates, and their corresponding expected outputs -MODEL_TEMPLATE_GENERATON_OUTPUT = [ - ("facebook/opt-125m", None, True, - "Hello</s>Hi there!</s>What is the capital of</s>"), - ("facebook/opt-125m", None, False, - "Hello</s>Hi there!</s>What is the capital of</s>"), - ("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user -Hello<|im_end|> -<|im_start|>assistant -Hi there!<|im_end|> -<|im_start|>user -What is the capital of<|im_end|> -<|im_start|>assistant -"""), - ("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user -Hello<|im_end|> -<|im_start|>assistant -Hi there!<|im_end|> -<|im_start|>user -What is the capital of""") -] - -TEST_MESSAGES = [ - { - 'role': 'user', - 'content': 'Hello' - }, - { - 'role': 'assistant', - 'content': 'Hi there!' - }, - { - 'role': 'user', - 'content': 'What is the capital of' - }, -] - - -@dataclass -class MockTokenizer: - chat_template = None - - -@dataclass -class MockServingChat: - tokenizer: MockTokenizer - - -def test_load_chat_template(): - # Testing chatml template - tokenizer = MockTokenizer() - mock_serving_chat = MockServingChat(tokenizer) - OpenAIServingChat._load_chat_template(mock_serving_chat, - chat_template=chatml_jinja_path) - - template_content = tokenizer.chat_template - - # Test assertions - assert template_content is not None - # Hard coded value for template_chatml.jinja - assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" - - -def test_no_load_chat_template(): - # Testing chatml template - template = "../../examples/does_not_exist" - tokenizer = MockTokenizer() - - mock_serving_chat = MockServingChat(tokenizer) - OpenAIServingChat._load_chat_template(mock_serving_chat, - chat_template=template) - template_content = tokenizer.chat_template - - # Test assertions - assert template_content is not None - # Hard coded value for template_chatml.jinja - assert template_content == """../../examples/does_not_exist""" - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model,template,add_generation_prompt,expected_output", - MODEL_TEMPLATE_GENERATON_OUTPUT) -async def test_get_gen_prompt(model, template, add_generation_prompt, - expected_output): - # Initialize the tokenizer - tokenizer = get_tokenizer(tokenizer_name=model) - mock_serving_chat = 
MockServingChat(tokenizer) - OpenAIServingChat._load_chat_template(mock_serving_chat, - chat_template=template) - - # Create a mock request object using keyword arguments - mock_request = ChatCompletionRequest( - model=model, - messages=TEST_MESSAGES, - add_generation_prompt=add_generation_prompt) - - # Call the function and get the result - result = tokenizer.apply_chat_template( - conversation=mock_request.messages, - tokenize=False, - add_generation_prompt=mock_request.add_generation_prompt) - - # Test assertion - assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}" diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py deleted file mode 100644 index 3e4d53c5cbe236ea409fbdf485b51410882fd85f..0000000000000000000000000000000000000000 --- a/tests/async_engine/test_request_tracker.py +++ /dev/null @@ -1,75 +0,0 @@ -import pytest - -from vllm.engine.async_llm_engine import RequestTracker -from vllm.outputs import RequestOutput - - -class DummyEvent: - - def __init__(self): - self.flag = False - - def set(self): - self.flag = True - - def clear(self): - self.flag = False - - -def test_request_tracker(): - tracker = RequestTracker() - tracker.new_requests_event = DummyEvent() - stream_1 = tracker.add_request("1") - assert tracker.new_requests_event.flag - new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag - assert len(new) == 1 - assert new[0]["request_id"] == "1" - assert not finished - assert not stream_1.finished - - stream_2 = tracker.add_request("2") - stream_3 = tracker.add_request("3") - assert tracker.new_requests_event.flag - new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag - assert len(new) == 2 - assert new[0]["request_id"] == "2" - assert new[1]["request_id"] == "3" - assert not finished - assert not stream_2.finished - assert not stream_3.finished - - # request_ids must be unique - with pytest.raises(KeyError): - tracker.add_request("1") - assert not tracker.new_requests_event.flag - - tracker.abort_request("1") - new, finished = tracker.get_new_and_finished_requests() - assert len(finished) == 1 - assert "1" in finished - assert not new - assert stream_1.finished - - stream_4 = tracker.add_request("4") - tracker.abort_request("4") - assert tracker.new_requests_event.flag - new, finished = tracker.get_new_and_finished_requests() - assert len(finished) == 1 - assert "4" in finished - assert not new - assert stream_4.finished - - stream_5 = tracker.add_request("5") - assert tracker.new_requests_event.flag - tracker.process_request_output( - RequestOutput("2", "output", [], [], [], finished=True)) - new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag - assert len(finished) == 1 - assert "2" in finished - assert len(new) == 1 - assert new[0]["request_id"] == "5" - assert stream_2.finished - assert not stream_5.finished diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 8d6afdbd00358803f57c30f67ec7079950c0f388..0000000000000000000000000000000000000000 --- a/tests/conftest.py +++ /dev/null @@ -1,224 +0,0 @@ -import os -from typing import List, Optional, Tuple - -import pytest -import torch -from transformers import AutoModelForCausalLM - -from vllm import LLM, SamplingParams -from vllm.transformers_utils.tokenizer import get_tokenizer - -_TEST_DIR = os.path.dirname(__file__) 
-_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] -_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] - - -def _read_prompts(filename: str) -> str: - prompts = [] - with open(filename, "r") as f: - prompt = f.readline() - prompts.append(prompt) - return prompts - - -@pytest.fixture -def example_prompts() -> List[str]: - prompts = [] - for filename in _TEST_PROMPTS: - prompts += _read_prompts(filename) - return prompts - - -@pytest.fixture -def example_long_prompts() -> List[str]: - prompts = [] - for filename in _LONG_PROMPTS: - prompts += _read_prompts(filename) - return prompts - - -_STR_DTYPE_TO_TORCH_DTYPE = { - "half": torch.half, - "bfloat16": torch.bfloat16, - "float": torch.float, -} - - -class HfRunner: - - def __init__( - self, - model_name: str, - tokenizer_name: Optional[str] = None, - dtype: str = "half", - ) -> None: - assert dtype in _STR_DTYPE_TO_TORCH_DTYPE - torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] - self.model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=torch_dtype, - trust_remote_code=True, - ).cuda() - if tokenizer_name is None: - tokenizer_name = model_name - self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True) - - def generate( - self, - prompts: List[str], - **kwargs, - ) -> List[Tuple[List[int], str]]: - outputs: List[Tuple[List[int], str]] = [] - for prompt in prompts: - input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids - output_ids = self.model.generate( - input_ids.cuda(), - use_cache=True, - **kwargs, - ) - output_str = self.tokenizer.batch_decode( - output_ids, - skip_special_tokens=True, - clean_up_tokenization_spaces=False, - ) - output_ids = output_ids.cpu().tolist() - outputs.append((output_ids, output_str)) - return outputs - - def generate_greedy( - self, - prompts: List[str], - max_tokens: int, - ) -> List[Tuple[List[int], str]]: - outputs = self.generate(prompts, - do_sample=False, - max_new_tokens=max_tokens) - for i in range(len(outputs)): - output_ids, output_str = outputs[i] - outputs[i] = (output_ids[0], output_str[0]) - return outputs - - def generate_beam_search( - self, - prompts: List[str], - beam_width: int, - max_tokens: int, - ) -> List[Tuple[List[int], str]]: - outputs = self.generate(prompts, - do_sample=False, - max_new_tokens=max_tokens, - num_beams=beam_width, - num_return_sequences=beam_width) - for i in range(len(outputs)): - output_ids, output_str = outputs[i] - for j in range(len(output_ids)): - output_ids[j] = [ - x for x in output_ids[j] - if x != self.tokenizer.pad_token_id - ] - outputs[i] = (output_ids, output_str) - return outputs - - def generate_greedy_logprobs( - self, - prompts: List[str], - max_tokens: int, - ) -> List[List[torch.Tensor]]: - all_logprobs = [] - for prompt in prompts: - input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids - output = self.model.generate( - input_ids.cuda(), - use_cache=True, - do_sample=False, - max_new_tokens=max_tokens, - output_hidden_states=True, - return_dict_in_generate=True, - ) - seq_logprobs = [] - for hidden_states in output.hidden_states: - last_hidden_states = hidden_states[-1][0] - logits = torch.matmul( - last_hidden_states, - self.model.get_output_embeddings().weight.t(), - ) - if self.model.get_output_embeddings().bias is not None: - logits += self.model.get_output_embeddings( - ).bias.unsqueeze(0) - logprobs = torch.nn.functional.log_softmax(logits, - dim=-1, - dtype=torch.float32) - seq_logprobs.append(logprobs) - all_logprobs.append(seq_logprobs) - return 
all_logprobs - - -@pytest.fixture -def hf_runner(): - return HfRunner - - -class VllmRunner: - - def __init__( - self, - model_name: str, - tokenizer_name: Optional[str] = None, - dtype: str = "half", - ) -> None: - self.model = LLM( - model=model_name, - tokenizer=tokenizer_name, - trust_remote_code=True, - dtype=dtype, - swap_space=0, - ) - - def generate( - self, - prompts: List[str], - sampling_params: SamplingParams, - ) -> List[Tuple[List[int], str]]: - req_outputs = self.model.generate(prompts, - sampling_params=sampling_params) - outputs = [] - for req_output in req_outputs: - prompt_str = req_output.prompt - prompt_ids = req_output.prompt_token_ids - req_sample_output_ids = [] - req_sample_output_strs = [] - for sample in req_output.outputs: - output_str = sample.text - output_ids = sample.token_ids - req_sample_output_ids.append(prompt_ids + output_ids) - req_sample_output_strs.append(prompt_str + output_str) - outputs.append((req_sample_output_ids, req_sample_output_strs)) - return outputs - - def generate_greedy( - self, - prompts: List[str], - max_tokens: int, - ) -> List[Tuple[List[int], str]]: - greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - outputs = self.generate(prompts, greedy_params) - return [(output_ids[0], output_str[0]) - for output_ids, output_str in outputs] - - def generate_beam_search( - self, - prompts: List[str], - beam_width: int, - max_tokens: int, - ) -> List[Tuple[List[int], str]]: - beam_search_params = SamplingParams(n=beam_width, - use_beam_search=True, - temperature=0.0, - max_tokens=max_tokens) - outputs = self.generate(prompts, beam_search_params) - return outputs - - -@pytest.fixture -def vllm_runner(): - return VllmRunner diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py deleted file mode 100644 index 9474cb21599d4cb0dec1bbd1fb4434e8ec4392c1..0000000000000000000000000000000000000000 --- a/tests/distributed/test_comm_ops.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Test the communication operators. - -Run `pytest tests/distributed/test_comm_ops.py --forked`. 
-""" -import pytest -import torch -import ray - -from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_reduce, - tensor_model_parallel_all_gather, - broadcast_tensor_dict, -) -from vllm.test_utils import (init_test_distributed_environment, - multi_process_tensor_parallel) - - -@ray.remote(num_gpus=1, max_calls=1) -def all_reduce_test_worker(tensor_parallel_size: int, rank: int, - distributed_init_port: str): - init_test_distributed_environment(1, tensor_parallel_size, rank, - distributed_init_port) - num_elements = 8 - all_tensors = [ - torch.arange(num_elements, dtype=torch.float32, device="cuda") * - (r + 1) for r in range(tensor_parallel_size) - ] - expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0) - t = all_tensors[rank] - t = tensor_model_parallel_all_reduce(t) - assert torch.allclose(t, expected) - - -@ray.remote(num_gpus=1, max_calls=1) -def all_gather_test_worker(tensor_parallel_size: int, rank: int, - distributed_init_port: str): - init_test_distributed_environment(1, tensor_parallel_size, rank, - distributed_init_port) - num_dimensions = 3 - tensor_size = list(range(2, num_dimensions + 2)) - total_size = 1 - for s in tensor_size: - total_size *= s - for all_gather_dimension in range(num_dimensions): - all_tensors = [ - torch.arange(total_size, dtype=torch.float32, - device="cuda").reshape(tensor_size) * (r + 1) - for r in range(tensor_parallel_size) - ] - expected = torch.cat(all_tensors, dim=all_gather_dimension) - t = all_tensors[rank] - t = tensor_model_parallel_all_gather(t, all_gather_dimension) - assert torch.allclose(t, expected) - - -@ray.remote(num_gpus=1, max_calls=1) -def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int, - distributed_init_port: str): - init_test_distributed_environment(1, tensor_parallel_size, rank, - distributed_init_port) - test_dict = { - "a": torch.arange(8, dtype=torch.float32, device="cuda"), - "b": torch.arange(16, dtype=torch.int8, device="cuda"), - "c": "test", - "d": [1, 2, 3], - "e": { - "a": 1, - "b": 2 - }, - } - - if rank == 0: - broadcast_tensor_dict(test_dict, src=0) - else: - recv_dict = broadcast_tensor_dict(src=0) - assert len(recv_dict) == len(test_dict) - assert torch.allclose(recv_dict["a"], test_dict["a"]) - assert torch.allclose(recv_dict["b"], test_dict["b"]) - assert recv_dict["c"] == test_dict["c"] - assert recv_dict["d"] == test_dict["d"] - assert recv_dict["e"] == test_dict["e"] - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize("tensor_parallel_size", [2]) -@pytest.mark.parametrize("test_target", [ - all_reduce_test_worker, all_gather_test_worker, - broadcast_tensor_dict_test_worker -]) -def test_multi_process_tensor_parallel(tensor_parallel_size, test_target): - multi_process_tensor_parallel(tensor_parallel_size, test_target) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py deleted file mode 100644 index ed4965593c2f0df06c9de540e02265ec0ad8b975..0000000000000000000000000000000000000000 --- a/tests/distributed/test_custom_all_reduce.py +++ /dev/null @@ -1,85 +0,0 @@ -import random - -import os -import pytest -import ray -import torch -import torch.distributed as dist - -from vllm.model_executor.parallel_utils import custom_all_reduce as custom_ar -from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_reduce) -from vllm.test_utils import (init_test_distributed_environment, - 
multi_process_tensor_parallel) - -random.seed(42) -test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)] -for i, v in enumerate(test_sizes): - test_sizes[i] -= v % 8 - - -@ray.remote(num_gpus=1, max_calls=1) -def graph_allreduce(world_size, rank, distributed_init_port): - del os.environ["CUDA_VISIBLE_DEVICES"] - device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) - init_test_distributed_environment(1, world_size, rank, - distributed_init_port) - - custom_ar.init_custom_ar() - for sz in test_sizes: - for dtype in [torch.float32, torch.float16, torch.bfloat16]: - with custom_ar.capture(): - # use integers so result matches NCCL exactly - inp1 = torch.randint(1, - 16, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) - inp2 = torch.randint(1, - 16, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) - torch.cuda.synchronize() - graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph): - out1 = tensor_model_parallel_all_reduce(inp1) - # the input buffer is immediately modified to test - # synchronization - dist.all_reduce(inp1) - out2 = tensor_model_parallel_all_reduce(inp2) - dist.all_reduce(inp2) - graph.replay() - assert torch.allclose(out1, inp1) - assert torch.allclose(out2, inp2) - - -@ray.remote(num_gpus=1, max_calls=1) -def eager_allreduce(world_size, rank, distributed_init_port): - del os.environ["CUDA_VISIBLE_DEVICES"] - device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) - init_test_distributed_environment(1, world_size, rank, - distributed_init_port) - - sz = 1024 - custom_ar.init_custom_ar() - fa = custom_ar.get_handle() - inp = torch.ones(sz, dtype=torch.float32, device=device) - out = fa.all_reduce_unreg(inp) - assert torch.allclose(out, inp * world_size) - - inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) - out = fa.all_reduce_unreg(inp) - assert torch.allclose(out, inp * world_size) - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize("tensor_parallel_size", [2]) -@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce]) -def test_multi_process_tensor_parallel(tensor_parallel_size, test_target): - multi_process_tensor_parallel(tensor_parallel_size, test_target) - - -if __name__ == "__main__": - multi_process_tensor_parallel(2, graph_allreduce) diff --git a/tests/engine/test_detokenize.py b/tests/engine/test_detokenize.py deleted file mode 100644 index 4421739390e3ba41fae3225caa952c36247ce0f9..0000000000000000000000000000000000000000 --- a/tests/engine/test_detokenize.py +++ /dev/null @@ -1,62 +0,0 @@ -import pytest - -from transformers import AutoTokenizer - -from vllm.transformers_utils.tokenizer import detokenize_incrementally - -TRUTH = [ - "Hello here, this is a simple test", # noqa: E501 - "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 
It is designed to be used in production environments, where inference and serving", # noqa: E501 - "我很感谢你的热情" # noqa: E501 -] -TOKENIZERS = [ - "facebook/opt-125m", - "gpt2", - "bigcode/tiny_starcoder_py", - "EleutherAI/gpt-j-6b", - "EleutherAI/pythia-70m", - "bigscience/bloom-560m", - "mosaicml/mpt-7b", - "tiiuae/falcon-7b", - "meta-llama/Llama-2-7b-hf", - "codellama/CodeLlama-7b-hf", -] - - -def _run_incremental_decode(tokenizer, all_input_ids, - skip_special_tokens: bool): - decoded_text = "" - offset = 0 - token_offset = 0 - prev_tokens = None - for i in range(len(all_input_ids)): - new_tokens, text, offset, token_offset = detokenize_incrementally( - tokenizer, - all_input_ids[:i + 1], - prev_tokens, - offset, - token_offset, - skip_special_tokens=skip_special_tokens) - decoded_text += text - if prev_tokens is None: - prev_tokens = new_tokens - else: - prev_tokens += new_tokens - return decoded_text - - -@pytest.mark.parametrize("truth", TRUTH) -@pytest.mark.parametrize("tokenizer_id", TOKENIZERS) -@pytest.mark.parametrize("skip_special_tokens", (True, False)) -def test_decode_streaming(tokenizer_id, truth, skip_special_tokens): - tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) - all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"] - if skip_special_tokens: - all_input_ids = ([tokenizer.bos_token_id] - if tokenizer.bos_token_id is not None else - []) + all_input_ids + [tokenizer.eos_token_id] - - decoded_text = _run_incremental_decode( - tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens) - - assert decoded_text == truth diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py deleted file mode 100644 index 54522f0a99fa1f3e575eca24ea5191c9a9161044..0000000000000000000000000000000000000000 --- a/tests/entrypoints/test_openai_server.py +++ /dev/null @@ -1,254 +0,0 @@ -import os -import subprocess -import time - -import sys -import pytest -import requests -import ray # using Ray for overall ease of process management, parallel requests, and debugging. 
-import openai # use the official client for correctness check - -MAX_SERVER_START_WAIT_S = 600 # wait up to 600 seconds for the server to start -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here - -pytestmark = pytest.mark.asyncio - - -@ray.remote(num_gpus=1) -class ServerRunner: - - def __init__(self, args): - env = os.environ.copy() - env["PYTHONUNBUFFERED"] = "1" - self.proc = subprocess.Popen( - ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args, - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - ) - self._wait_for_server() - - def ready(self): - return True - - def _wait_for_server(self): - # run health check - start = time.time() - while True: - try: - if requests.get( - "http://localhost:8000/health").status_code == 200: - break - except Exception as err: - if self.proc.poll() is not None: - raise RuntimeError("Server exited unexpectedly.") from err - - time.sleep(0.5) - if time.time() - start > MAX_SERVER_START_WAIT_S: - raise RuntimeError( - "Server failed to start in time.") from err - - def __del__(self): - if hasattr(self, "proc"): - self.proc.terminate() - - -@pytest.fixture(scope="session") -def server(): - ray.init() - server_runner = ServerRunner.remote([ - "--model", - MODEL_NAME, - "--dtype", - "bfloat16", # use bfloat16 for speed and memory savings in CI environment - "--max-model-len", - "8192", - "--enforce-eager", - ]) - ray.get(server_runner.ready.remote()) - yield server_runner - ray.shutdown() - - -@pytest.fixture(scope="session") -def client(): - client = openai.AsyncOpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) - yield client - - -async def test_single_completion(server, client: openai.AsyncOpenAI): - completion = await client.completions.create(model=MODEL_NAME, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - assert completion.choices[0].text is not None and len( - completion.choices[0].text) >= 5 - assert completion.choices[0].finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) - - # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert completion.choices[0].text is not None and len( - completion.choices[0].text) >= 5 - - -async def test_single_chat_session(server, client: openai.AsyncOpenAI): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" 
- }] - - # test single completion - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=10, - ) - assert chat_completion.id is not None - assert chat_completion.choices is not None and len( - chat_completion.choices) == 1 - assert chat_completion.choices[0].message is not None - message = chat_completion.choices[0].message - assert message.content is not None and len(message.content) >= 10 - assert message.role == "assistant" - messages.append({"role": "assistant", "content": message.content}) - - # test multi-turn dialogue - messages.append({"role": "user", "content": "express your result in json"}) - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=10, - ) - message = chat_completion.choices[0].message - assert message.content is not None and len(message.content) >= 0 - - -async def test_completion_streaming(server, client: openai.AsyncOpenAI): - prompt = "What is an LLM?" - - single_completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=5, - temperature=0.0, - ) - single_output = single_completion.choices[0].text - single_usage = single_completion.usage - - stream = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - ) - chunks = [] - async for chunk in stream: - chunks.append(chunk.choices[0].text) - assert chunk.choices[0].finish_reason == "length" - assert chunk.usage == single_usage - assert "".join(chunks) == single_output - - -async def test_chat_streaming(server, client: openai.AsyncOpenAI): - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": "user", - "content": "what is 1+1?" - }] - - # test single completion - chat_completion = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=10, - temperature=0.0, - ) - output = chat_completion.choices[0].message.content - stop_reason = chat_completion.choices[0].finish_reason - - # test streaming - stream = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - ) - chunks = [] - async for chunk in stream: - delta = chunk.choices[0].delta - if delta.role: - assert delta.role == "assistant" - if delta.content: - chunks.append(delta.content) - assert chunk.choices[0].finish_reason == stop_reason - assert "".join(chunks) == output - - -async def test_batch_completions(server, client: openai.AsyncOpenAI): - # test simple list - batch = await client.completions.create( - model=MODEL_NAME, - prompt=["Hello, my name is", "Hello, my name is"], - max_tokens=5, - temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=MODEL_NAME, - prompt=["Hello, my name is", "Hello, my name is"], - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary for official client. 
- use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" - - # test streaming - batch = await client.completions.create( - model=MODEL_NAME, - prompt=["Hello, my name is", "Hello, my name is"], - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/tests/kernels/conftest.py b/tests/kernels/conftest.py deleted file mode 100644 index 8c51bfc149efe47187d67f04d602d18617662a32..0000000000000000000000000000000000000000 --- a/tests/kernels/conftest.py +++ /dev/null @@ -1,7 +0,0 @@ -import pytest -from vllm.utils import create_kv_caches_with_random - - -@pytest.fixture() -def kv_cache_factory(): - return create_kv_caches_with_random diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py deleted file mode 100644 index 826bf8350af17705de2584946545694b3d989099..0000000000000000000000000000000000000000 --- a/tests/kernels/test_activation.py +++ /dev/null @@ -1,78 +0,0 @@ -import pytest -import torch - -from vllm.model_executor.layers.activation import FastGELU, NewGELU, SiluAndMul - -DTYPES = [torch.half, torch.bfloat16, torch.float] -NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing -D = [512, 4096, 5120, 13824] # Arbitrary values for testing -SEEDS = [0] -DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] - - -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("d", D) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) -@torch.inference_mode() -def test_silu_and_mul( - num_tokens: int, - d: int, - dtype: torch.dtype, - seed: int, - device: int, -) -> None: - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - x = torch.randn(num_tokens, 2 * d, dtype=dtype, device=gpu_id) - layer = SiluAndMul() - out = layer(x) - ref_out = layer._forward(x) - assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) - - -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("d", D) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) -@torch.inference_mode() -def test_gelu_new( - num_tokens: int, - d: int, - dtype: torch.dtype, - seed: int, - device: int, -) -> None: - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id) - layer = NewGELU() - out = layer(x) - ref_out = layer._forward(x) - assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) - - -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("d", D) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) -def test_gelu_fast( - num_tokens: int, - d: int, - dtype: torch.dtype, - seed: int, - device: int, -) -> None: - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - x = 
torch.randn(num_tokens, d, dtype=dtype, device=gpu_id) - layer = FastGELU() - out = layer(x) - ref_out = layer._forward(x) - assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py deleted file mode 100644 index cbb1d40623c71dfb9c9fe90cc285bd27ca1490ae..0000000000000000000000000000000000000000 --- a/tests/kernels/test_attention.py +++ /dev/null @@ -1,365 +0,0 @@ -import random -from typing import List, Optional, Tuple - -import pytest -import torch -from xformers import ops as xops -from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask - -from vllm._C import ops, cache_ops -from vllm.utils import get_max_shared_memory_bytes - -FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 -# This will change depending on the compute capability. -# - 512 as a buffer -MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 -# There may not be enough gpu memory due to large NUM_BLOCKS. -# Reduce NUM_BLOCKS when it happens. -NUM_BLOCKS = 4321 # Arbitrary values for testing -PARTITION_SIZE = 512 - -DTYPES = [torch.half, torch.bfloat16, torch.float] -NUM_GEN_SEQS = [7] # Arbitrary values for testing -NUM_PREFILL_SEQS = [3] # Arbitrary values for testing -NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 128, 256] -BLOCK_SIZES = [16, 32] -USE_ALIBI = [False, True] -KV_CACHE_DTYPE = ["auto", "fp8_e5m2"] -SEEDS = [0] -DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] - - -def ref_masked_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - scale: float, - attn_mask: Optional[torch.Tensor] = None, -) -> torch.Tensor: - attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() - if attn_mask is not None: - attn_weights = attn_weights + attn_mask.float() - attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) - out = torch.einsum("hqk,khd->qhd", attn_weights, value) - return out - - -def ref_single_query_cached_kv_attention( - output: torch.Tensor, - query: torch.Tensor, - num_queries_per_kv: int, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_tables: torch.Tensor, - context_lens: torch.Tensor, - scale: float, - alibi_slopes: Optional[torch.Tensor], -) -> None: - num_query_heads = query.shape[1] - num_kv_heads = value_cache.shape[1] - head_size = value_cache.shape[2] - block_size = value_cache.shape[3] - num_seqs = query.shape[0] - - block_tables = block_tables.cpu().tolist() - context_lens = context_lens.cpu().tolist() - for i in range(num_seqs): - q = query[i].unsqueeze(0) - block_table = block_tables[i] - context_len = int(context_lens[i]) - - keys = [] - values = [] - for j in range(context_len): - block_number = int(block_table[j // block_size]) - block_offset = j % block_size - - k = key_cache[block_number, :, :, block_offset, :] - k = k.reshape(num_kv_heads, head_size) - keys.append(k) - - v = value_cache[block_number, :, :, block_offset] - values.append(v) - keys = torch.stack(keys, dim=0) - values = torch.stack(values, dim=0) - if num_queries_per_kv > 1: - # Handle MQA and GQA - keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) - values = torch.repeat_interleave(values, num_queries_per_kv, dim=1) - - alibi_bias = None - if alibi_slopes is not None: - # Create the ALiBi bias used in the paged attention kernel. 
- position_ids = torch.arange(context_len, device=query.device).int() - alibi_bias = (position_ids - context_len + 1).float() - alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view( - 1, 1, -1) - - out = ref_masked_attention(q, keys, values, scale, alibi_bias) - out = out.view(num_query_heads, head_size) - output[i].copy_(out, non_blocking=True) - - -@pytest.mark.parametrize("version", ["v1", "v2"]) -@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("use_alibi", USE_ALIBI) -@pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) -def test_paged_attention( - kv_cache_factory, - version: str, - num_seqs: int, - num_heads: Tuple[int, int], - head_size: int, - use_alibi: bool, - block_size: int, - dtype: torch.dtype, - kv_cache_dtype: str, - seed: int, - device: int, -) -> None: - random.seed(seed) - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - scale = float(1.0 / (head_size**0.5)) - num_query_heads, num_kv_heads = num_heads - query = torch.empty(num_seqs, - num_query_heads, - head_size, - dtype=dtype, - device=gpu_id) - query.uniform_(-scale, scale) - - assert num_query_heads % num_kv_heads == 0 - num_queries_per_kv = num_query_heads // num_kv_heads - alibi_slopes = None - if use_alibi: - alibi_slopes = torch.randn(num_query_heads, - dtype=torch.float, - device=gpu_id) - - context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)] - context_lens[-1] = MAX_SEQ_LEN - max_context_len = max(context_lens) - context_lens = torch.tensor(context_lens, dtype=torch.int, device=gpu_id) - - # Create the block tables. - max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size - block_tables = [] - for _ in range(num_seqs): - block_table = [ - random.randint(0, NUM_BLOCKS - 1) - for _ in range(max_num_blocks_per_seq) - ] - block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int, device=gpu_id) - - # Create the KV caches. - key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, - num_kv_heads, head_size, - kv_cache_dtype, dtype, seed, - gpu_id) - key_cache, value_cache = key_caches[0], value_caches[0] - - # Call the paged attention kernel. 
- output = torch.empty_like(query) - if version == "v1": - ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - context_lens, - block_size, - max_context_len, - alibi_slopes, - kv_cache_dtype, - ) - elif version == "v2": - num_partitions = ((max_context_len + PARTITION_SIZE - 1) // - PARTITION_SIZE) - assert PARTITION_SIZE % block_size == 0 - num_seqs, num_heads, head_size = output.shape - tmp_output = torch.empty( - size=(num_seqs, num_heads, num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - context_lens, - block_size, - max_context_len, - alibi_slopes, - kv_cache_dtype, - ) - else: - raise AssertionError(f"Unknown version: {version}") - - # Run the reference implementation. - if kv_cache_dtype == "fp8_e5m2": - # Convert cache data back to dtype. - x = 16 // torch.tensor([], dtype=dtype).element_size() - key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, - block_size, x) - dequantized_key_cache = torch.empty(size=key_cache_shape, - dtype=dtype, - device=gpu_id) - cache_ops.convert_fp8_e5m2(key_cache, dequantized_key_cache) - key_cache = dequantized_key_cache - - value_cache_shape = value_cache.shape - dequantized_value_cache = torch.empty(size=value_cache_shape, - dtype=dtype, - device=gpu_id) - cache_ops.convert_fp8_e5m2(value_cache, dequantized_value_cache) - value_cache = dequantized_value_cache - - ref_output = torch.empty_like(query) - ref_single_query_cached_kv_attention( - ref_output, - query, - num_queries_per_kv, - key_cache, - value_cache, - block_tables, - context_lens, - scale, - alibi_slopes, - ) - - # NOTE(woosuk): Due to the kernel-level differences in the two - # implementations, there is a small numerical difference in the two - # outputs. Thus, we use a relaxed tolerance for the test. - # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, - # so we use a relaxed tolerance for the test. - atol, rtol = 1e-3, 1e-5 - if kv_cache_dtype == "fp8_e5m2": - atol, rtol = 1e-2, 1e-5 - assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) - - -def ref_multi_query_kv_attention( - cu_seq_lens: List[int], - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - scale: float, - dtype: torch.dtype, -) -> torch.Tensor: - num_seqs = len(cu_seq_lens) - 1 - ref_outputs = [] - for i in range(num_seqs): - start_idx = cu_seq_lens[i] - end_idx = cu_seq_lens[i + 1] - seq_len = end_idx - start_idx - - # Create attention mask. - attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype), - diagonal=1) - attn_mask = attn_mask * torch.finfo(dtype).min - attn_mask = attn_mask.to(dtype=dtype, device=query.device) - - ref_output = ref_masked_attention( - query[start_idx:end_idx], - key[start_idx:end_idx], - value[start_idx:end_idx], - scale, - attn_mask=attn_mask, - ) - ref_outputs.append(ref_output) - ref_output = torch.cat(ref_outputs, dim=0) - return ref_output - - -# TODO(woosuk): Add tests for USE_ALIBI=True. 
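(Annotation, not part of the deleted test file.) The reference implementation above masks out future positions by adding a large negative bias above the diagonal before the softmax. A minimal standalone sketch of that mask construction, with an assumed sequence length and dtype:

```python
import torch

# Illustrative values only; the test derives these from its parameters.
seq_len, dtype = 4, torch.float32

# Ones strictly above the diagonal mark the "future" positions...
mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype), diagonal=1)
# ...which are pushed to a very large negative value so softmax drives them to zero.
mask = mask * torch.finfo(dtype).min
```

Built per sequence and applied to each slice, this plays the same role as the block-diagonal causal bias that the test later asks xformers to construct from the same sequence lengths, which is why the two outputs can be compared directly.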
-@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) -@torch.inference_mode() -def test_multi_query_kv_attention( - num_seqs: int, - num_heads: Tuple[int, int], - head_size: int, - dtype: torch.dtype, - seed: int, - device: int, -) -> None: - random.seed(seed) - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. - # As the xformers library is already tested with its own tests, we can use - # a smaller MAX_SEQ_LEN here. - max_len = min(MAX_SEQ_LEN, 4096) - seq_lens = random.sample(range(1, max_len), num_seqs) - num_tokens = sum(seq_lens) - - scale = float(1.0 / (head_size**0.5)) - num_query_heads, num_kv_heads = num_heads - qkv = torch.empty(num_tokens, - num_query_heads + 2 * num_kv_heads, - head_size, - dtype=dtype, - device=gpu_id) - qkv.uniform_(-scale, scale) - query, key, value = qkv.split( - [num_query_heads, num_kv_heads, num_kv_heads], dim=1) - - num_queries_per_kv = num_query_heads // num_kv_heads - if num_queries_per_kv > 1: - # Handle MQA and GQA - key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) - value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) - attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens) - output = xops.memory_efficient_attention_forward( - query.unsqueeze(0), - key.unsqueeze(0), - value.unsqueeze(0), - attn_bias=attn_bias, - p=0.0, - scale=scale, - ) - output = output.squeeze(0) - - cu_seq_lens = [0] - for seq_len in seq_lens: - cu_seq_lens.append(cu_seq_lens[-1] + seq_len) - ref_output = ref_multi_query_kv_attention( - cu_seq_lens, - query, - key, - value, - scale, - dtype, - ) - assert torch.allclose(output, ref_output, atol=1e-3, rtol=1e-5) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py deleted file mode 100644 index 193bc29bd431def6af86a5fc5e010fce797842b2..0000000000000000000000000000000000000000 --- a/tests/kernels/test_cache.py +++ /dev/null @@ -1,155 +0,0 @@ -import random - -import pytest -import torch - -from vllm._C import cache_ops - -DTYPES = [torch.half, torch.bfloat16, torch.float] -NUM_TOKENS = [42] # Arbitrary values for testing -NUM_LAYERS = [1] # Arbitrary values for testing -NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 128, 256] -BLOCK_SIZES = [8, 16, 32] -NUM_BLOCKS = [1024, 3600] # Arbitrary values for testing -NUM_MAPPINGS = [256] # Arbitrary values for testing -SEEDS = [0] -DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] -KV_CACHE_DTYPE = ["auto", "fp8_e5m2"] - - -@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) -@pytest.mark.parametrize("num_layers", NUM_LAYERS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) -@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@torch.inference_mode() -def test_copy_blocks( - kv_cache_factory, - num_mappings: int, - num_layers: int, - num_heads: int, - head_size: int, - block_size: int, - num_blocks: int, - dtype: torch.dtype, - seed: int, - device: int, - 
kv_cache_dtype: str, -) -> None: - random.seed(seed) - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - # Generate random block mappings where each source block is mapped to two - # destination blocks. - assert 2 * num_mappings <= num_blocks - src_blocks = random.sample(range(num_blocks), num_mappings) - remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) - dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) - block_mapping = {} - for i in range(num_mappings): - src = src_blocks[i] - dst1 = dst_blocks[2 * i] - dst2 = dst_blocks[2 * i + 1] - block_mapping[src] = [dst1, dst2] - - # Create the KV caches. - key_caches, value_caches = kv_cache_factory(num_blocks, block_size, - num_layers, num_heads, - head_size, kv_cache_dtype, - dtype, seed, gpu_id) - - # Clone the KV caches. - cloned_key_caches = [key_cache.clone() for key_cache in key_caches] - cloned_value_caches = [value_cache.clone() for value_cache in value_caches] - - # Call the copy blocks kernel. - cache_ops.copy_blocks(key_caches, value_caches, block_mapping) - - # Run the reference implementation. - for src, dsts in block_mapping.items(): - for dst in dsts: - for cloned_key_cache in cloned_key_caches: - cloned_key_cache[dst].copy_(cloned_key_cache[src]) - for cloned_value_cache in cloned_value_caches: - cloned_value_cache[dst].copy_(cloned_value_cache[src]) - - # Compare the results. - for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches): - assert torch.allclose(key_cache, cloned_key_cache) - for value_cache, cloned_value_cache in zip(value_caches, - cloned_value_caches): - assert torch.allclose(value_cache, cloned_value_cache) - - -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) -@torch.inference_mode() -def test_reshape_and_cache( - kv_cache_factory, - num_tokens: int, - num_heads: int, - head_size: int, - block_size: int, - num_blocks: int, - dtype: torch.dtype, - seed: int, - device: int, -) -> None: - random.seed(seed) - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - # Create a random slot mapping. - num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=gpu_id) - - qkv = torch.randn(num_tokens, - 3, - num_heads, - head_size, - dtype=dtype, - device=gpu_id) - _, key, value = qkv.unbind(dim=1) - - # Create the KV caches. - key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, - num_heads, head_size, dtype, - None, seed, gpu_id) - key_cache, value_cache = key_caches[0], value_caches[0] - - # Clone the KV caches. - cloned_key_cache = key_cache.clone() - cloned_value_cache = value_cache.clone() - - # Call the reshape_and_cache kernel. - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, "auto") - - # Run the reference implementation. 
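(Annotation, not part of the deleted test file.) The reference implementation below inverts the flat slot mapping by hand: each slot index splits into a block index and an offset inside that block. A minimal sketch of that decomposition, with an assumed block size:

```python
import torch

block_size = 16                           # assumed for illustration
slot_mapping = torch.tensor([0, 17, 35])  # flat slot indices
block_idx = torch.div(slot_mapping, block_size, rounding_mode="floor")  # tensor([0, 1, 2])
block_off = slot_mapping % block_size                                   # tensor([0, 1, 3])
```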
- reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) - block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indicies = block_indicies.cpu().tolist() - block_offsets = slot_mapping % block_size - block_offsets = block_offsets.cpu().tolist() - for i in range(num_tokens): - block_idx = block_indicies[i] - block_offset = block_offsets[i] - cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] - cloned_value_cache[block_idx, :, :, block_offset] = value[i] - - assert torch.allclose(key_cache, cloned_key_cache) - assert torch.allclose(value_cache, cloned_value_cache) diff --git a/tests/kernels/test_fused_moe.py b/tests/kernels/test_fused_moe.py deleted file mode 100644 index 80a0349d6575b4e7ab30382b24441b8c2f59e570..0000000000000000000000000000000000000000 --- a/tests/kernels/test_fused_moe.py +++ /dev/null @@ -1,50 +0,0 @@ -import pytest -import torch - -from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.activation import SiluAndMul - - -def torch_moe(a, w1, w2, topk_weight, topk_ids): - B, D = a.shape - a = a.view(B, -1, D).repeat(1, topk_ids.shape[1], 1).reshape(-1, D) - out = torch.zeros(B * topk_ids.shape[1], - w2.shape[1], - dtype=a.dtype, - device=a.device) - topk_ids = topk_ids.view(-1) - topk_weight = topk_weight.view(-1) - for i in range(w1.shape[0]): - mask = topk_ids == i - if mask.sum(): - out[mask] = SiluAndMul()( - a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1) - return (out.view(B, -1, w2.shape[1]) * - topk_weight.view(B, -1, 1)).sum(dim=1) - - -@pytest.mark.parametrize("m", [512, 222, 33, 1]) -@pytest.mark.parametrize("n", [2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 511, 1024]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) -def test_fused_moe( - m: int, - n: int, - k: int, - e: int, - topk: int, - dtype: torch.dtype, -): - a = torch.randn((m, k), device='cuda', dtype=dtype) / 10 - w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10 - w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10 - - score = torch.randn((m, e), device='cuda', dtype=dtype) - score = torch.softmax(score, dim=-1) - topk_weight, topk_ids = torch.topk(score, topk) - - triton_output = fused_moe(a, w1, w2, topk_weight, topk_ids, False) - torch_output = torch_moe(a, w1, w2, topk_weight, topk_ids) - assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py deleted file mode 100644 index 8a06b3aa268be1ef21a066fb7e927f8a1d94ca07..0000000000000000000000000000000000000000 --- a/tests/kernels/test_layernorm.py +++ /dev/null @@ -1,50 +0,0 @@ -import pytest -import torch - -from vllm.model_executor.layers.layernorm import RMSNorm - -DTYPES = [torch.half, torch.bfloat16, torch.float] -NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing -HIDDEN_SIZES = [768, 5120, 8192] # Arbitrary values for testing -ADD_RESIDUAL = [False, True] -SEEDS = [0] -DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] - - -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("add_residual", ADD_RESIDUAL) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) -@torch.inference_mode() -def test_rms_norm( - num_tokens: int, - 
hidden_size: int, - add_residual: bool, - dtype: torch.dtype, - seed: int, - device: int, -) -> None: - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - layer = RMSNorm(hidden_size).to(dtype=dtype, device=gpu_id) - layer.weight.data.normal_(mean=1.0, std=0.1) - scale = 1 / (2 * hidden_size) - x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=gpu_id) - x *= scale - residual = torch.randn_like(x) * scale if add_residual else None - - # NOTE(woosuk): The reference implementation should be executed first - # because the custom kernel is in-place. - ref_out = layer._forward(x, residual) - out = layer(x, residual) - # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger - # numerical errors than other operators because they involve reductions. - # Therefore, we use a larger tolerance. - if add_residual: - assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2) - assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2) - else: - assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2) diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py deleted file mode 100644 index aad310e2bc6d2a67c8a1b60d393d41f5ab52deaf..0000000000000000000000000000000000000000 --- a/tests/kernels/test_pos_encoding.py +++ /dev/null @@ -1,68 +0,0 @@ -from typing import Optional - -import pytest -import torch - -from vllm.model_executor.layers.rotary_embedding import get_rope - -IS_NEOX_STYLE = [True, False] -DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 128, 256] -ROTARY_DIMS = [None, 32] # None means rotary dim == head size -NUM_HEADS = [7, 17] # Arbitrary values for testing -BATCH_SIZES = [1, 5] # Arbitrary values for testing -SEQ_LENS = [11, 8192] # Arbitrary values for testing -SEEDS = [0] -DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] - - -@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -@pytest.mark.parametrize("seq_len", SEQ_LENS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) -@torch.inference_mode() -def test_rotary_embedding( - is_neox_style: bool, - batch_size: int, - seq_len: int, - num_heads: int, - head_size: int, - rotary_dim: Optional[int], - dtype: torch.dtype, - seed: int, - device: int, - max_position: int = 8192, - base: int = 10000, -) -> None: - if rotary_dim is None: - rotary_dim = head_size - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - gpu_id = f"cuda:{device}" - if rotary_dim is None: - rotary_dim = head_size - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style) - rope = rope.to(dtype=dtype, device=gpu_id) - - positions = torch.randint(0, - max_position, (batch_size, seq_len), - device=gpu_id) - query = torch.randn(batch_size, - seq_len, - num_heads * head_size, - dtype=dtype, - device=gpu_id) - key = torch.randn_like(query) - - # NOTE(woosuk): The reference implementation should be executed first - # because the custom kernel is in-place. - ref_query, ref_key = rope._forward(positions, query, key) - out_query, out_key = rope.forward(positions, query, key) - # Compare the results. 
- assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5) - assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5) diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py deleted file mode 100644 index 0531b05135fb91d4294b32d61b639c2f92ada308..0000000000000000000000000000000000000000 --- a/tests/kernels/test_prefix_prefill.py +++ /dev/null @@ -1,169 +0,0 @@ -import random -import pytest -import time - -import torch -from vllm.model_executor.layers.triton_kernel.prefix_prefill import ( - context_attention_fwd) -from xformers import ops as xops -from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask - -NUM_HEADS = [12] -HEAD_SIZES = [128] -DTYPES = [torch.float16] - - -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("dtype", DTYPES) -@torch.inference_mode() -def test_contexted_kv_attention( - num_heads: int, - head_size: int, - dtype: torch.dtype, -) -> None: - random.seed(0) - torch.manual_seed(0) - MAX_SEQ_LEN = 1024 - MAX_CTX_LEN = 1024 - BS = 10 - cache_size = 640 - block_size = 32 - max_block_per_request = 64 - subquery_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)] - ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)] - seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)] - - num_tokens = sum(subquery_lens) - query = torch.empty(num_tokens, - num_heads, - head_size, - dtype=dtype, - device='cuda') - query.uniform_(-1e-3, 1e-3) - output = torch.empty(num_tokens, - num_heads, - head_size, - dtype=dtype, - device='cuda') - - kv = torch.empty(sum(seq_lens), - 2, - num_heads, - head_size, - dtype=dtype, - device='cuda') - kv.uniform_(-1e-3, 1e-3) - key, value = kv.unbind(dim=1) - - k_cache = torch.zeros(cache_size, - block_size, - num_heads, - head_size, - dtype=dtype, - device='cuda') - v_cache = torch.zeros(cache_size, - block_size, - num_heads, - head_size, - dtype=dtype, - device='cuda') - k = torch.zeros(sum(subquery_lens), - num_heads, - head_size, - dtype=dtype, - device='cuda') - v = torch.zeros(sum(subquery_lens), - num_heads, - head_size, - dtype=dtype, - device='cuda') - values = torch.arange(0, cache_size, dtype=torch.long, device='cuda') - values = values[torch.randperm(cache_size)] - block_table = values[:BS * max_block_per_request].view( - BS, max_block_per_request) - b_seq_len = torch.tensor(seq_lens, dtype=torch.long, device='cuda') - b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long, device='cuda') - b_start_loc = torch.cumsum(torch.tensor([0] + subquery_lens[:-1], - dtype=torch.long, - device='cuda'), - dim=0) - max_input_len = MAX_SEQ_LEN - # copy kv to cache - b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1], - dtype=torch.long, - device='cuda'), - dim=0) - for i in range(BS): - for j in range(subquery_lens[i]): - k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + - j]) - v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + - b_ctx_len[i] + j]) - cur_ctx = 0 - block_id = 0 - while cur_ctx < b_ctx_len[i]: - start_loc = b_seq_start_loc[i] + cur_ctx - if cur_ctx + block_size > b_ctx_len[i]: - end_loc = b_seq_start_loc[i] + b_ctx_len[i] - else: - end_loc = start_loc + block_size - start_slot = block_table[i, block_id] * block_size - end_slot = start_slot + end_loc - start_loc - k_cache.view(-1, num_heads, head_size)[start_slot:end_slot].copy_( - key[start_loc:end_loc]) - v_cache.view(-1, num_heads, head_size)[start_slot:end_slot].copy_( - 
value[start_loc:end_loc]) - cur_ctx += block_size - block_id += 1 - # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size] - # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8] - k_cache = k_cache.view(-1, block_size, num_heads, head_size // 8, - 8).permute(0, 2, 3, 1, 4).contiguous() - # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size] - # to V_cache[num_blocks, num_kv_heads, head_size, block_size] - v_cache = v_cache.view(-1, block_size, num_heads, - head_size).permute(0, 2, 3, 1).contiguous() - - # Warm up the Triton kernel by calling it once before actually measuring generation time - context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table, - b_start_loc, b_seq_len, b_ctx_len, max_input_len) - torch.cuda.synchronize() - start_time = time.time() - context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table, - b_start_loc, b_seq_len, b_ctx_len, max_input_len) - torch.cuda.synchronize() - end_time = time.time() - print(f"triton Time: {(end_time - start_time)*1000:.2f} ms") - - scale = float(1.0 / (head_size**0.5)) - - attn_op = xops.fmha.cutlass.FwOp() - - attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens( - subquery_lens, seq_lens) - output_ref = xops.memory_efficient_attention_forward( - query.unsqueeze(0), - key.unsqueeze(0), - value.unsqueeze(0), - attn_bias=attn_bias, - p=0.0, - scale=scale, - op=attn_op, - ) - torch.cuda.synchronize() - start_time = time.time() - output_ref = xops.memory_efficient_attention_forward( - query.unsqueeze(0), - key.unsqueeze(0), - value.unsqueeze(0), - attn_bias=attn_bias, - p=0.0, - scale=scale, - op=attn_op, - ) - torch.cuda.synchronize() - end_time = time.time() - print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms") - output_ref = output_ref.squeeze(0) - assert torch.allclose(output_ref, output, atol=1e-6, rtol=0) diff --git a/tests/lora/__init__.py b/tests/lora/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py deleted file mode 100644 index c1b3d04c713b5a02d3fbdb31228fed4485c01e9b..0000000000000000000000000000000000000000 --- a/tests/lora/conftest.py +++ /dev/null @@ -1,143 +0,0 @@ -import contextlib -import gc -import tempfile -from collections import OrderedDict -from unittest.mock import patch, MagicMock - -import pytest -import ray -import torch -import torch.nn as nn -from huggingface_hub import snapshot_download - -import vllm -from vllm.config import LoRAConfig -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.model_loader import get_model -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.parallel_utils.parallel_state import ( - destroy_model_parallel, initialize_model_parallel) - - -def cleanup(): - destroy_model_parallel() - with contextlib.suppress(AssertionError): - torch.distributed.destroy_process_group() - gc.collect() - torch.cuda.empty_cache() - ray.shutdown() - - -@pytest.fixture(autouse=True) -def cleanup_fixture(): - yield - cleanup() - - -@pytest.fixture -def dist_init(): - if not torch.distributed.is_initialized(): - temp_file = tempfile.mkstemp()[1] - torch.distributed.init_process_group( - backend="nccl", - world_size=1, - rank=0, - init_method=f"file://{temp_file}", - ) - 
torch.distributed.all_reduce(torch.zeros(1).cuda()) - initialize_model_parallel(1, 1) - yield - cleanup() - - -@pytest.fixture -def dist_init_torch_only(): - if torch.distributed.is_initialized(): - return - temp_file = tempfile.mkstemp()[1] - torch.distributed.init_process_group( - backend="nccl", - world_size=1, - rank=0, - init_method=f"file://{temp_file}", - ) - - -@pytest.fixture -def dummy_model() -> nn.Module: - model = nn.Sequential( - OrderedDict([ - ("dense1", ColumnParallelLinear(764, 100)), - ("dense2", RowParallelLinear(100, 50)), - ( - "layer1", - nn.Sequential( - OrderedDict([ - ("dense1", ColumnParallelLinear(100, 10)), - ("dense2", RowParallelLinear(10, 50)), - ])), - ), - ("act2", nn.ReLU()), - ("output", ColumnParallelLinear(50, 10)), - ("outact", nn.Sigmoid()), - # Special handling for lm_head & sampler - ("lm_head", ParallelLMHead(512, 10)), - ("sampler", Sampler(512)) - ])) - model.config = MagicMock() - return model - - -@pytest.fixture -def dummy_model_gate_up() -> nn.Module: - model = nn.Sequential( - OrderedDict([ - ("dense1", ColumnParallelLinear(764, 100)), - ("dense2", RowParallelLinear(100, 50)), - ( - "layer1", - nn.Sequential( - OrderedDict([ - ("dense1", ColumnParallelLinear(100, 10)), - ("dense2", RowParallelLinear(10, 50)), - ])), - ), - ("act2", nn.ReLU()), - ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])), - ("outact", nn.Sigmoid()), - # Special handling for lm_head & sampler - ("lm_head", ParallelLMHead(512, 10)), - ("sampler", Sampler(512)) - ])) - model.config = MagicMock() - return model - - -@pytest.fixture(scope="session") -def sql_lora_files(): - return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - - -@pytest.fixture -def llama_2_7b_engine_extra_embeddings() -> nn.Module: - cleanup() - get_model_old = get_model - - def get_model_patched(model_config, lora_config=None): - return get_model_old(model_config, - LoRAConfig(max_loras=4, max_lora_rank=8)) - - with patch("vllm.worker.model_runner.get_model", get_model_patched): - engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) - yield engine.llm_engine - del engine - cleanup() - - -@pytest.fixture -def llama_2_7b_model_extra_embeddings( - llama_2_7b_engine_extra_embeddings) -> nn.Module: - yield llama_2_7b_engine_extra_embeddings.driver_worker.model_runner.model diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py deleted file mode 100644 index 71c671132205afbb48062460d5a2226991cb7ca7..0000000000000000000000000000000000000000 --- a/tests/lora/test_layers.py +++ /dev/null @@ -1,709 +0,0 @@ -import pytest -import random -from copy import deepcopy -from dataclasses import dataclass -from typing import List, Optional, Dict, Tuple - -import torch -import torch.nn.functional as F - -from vllm.lora.layers import ( - ColumnParallelLinearWithLoRA, - MergedColumnParallelLinearWithLoRA, - QKVParallelLinearWithLora, - VocabParallelEmbeddingWithLoRA, - RowParallelLinearWithLoRA, - SamplerWithLoRA, - LoRAMapping, - BaseLayerWithLoRA, -) -from vllm.lora.models import LoRALayerWeights, convert_mapping, PackedLoRALayerWeights -from vllm.config import LoRAConfig -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - RowParallelLinear, - QKVParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead -from vllm.model_executor.utils import set_random_seed - -from .utils import DummyLoRAManager - -TOLERANCES = 
{ - torch.float16: (5e-3, 5e-3), - torch.float32: (5e-3, 5e-3), - torch.bfloat16: (3e-2, 2e-2), -} - - -def get_random_id_to_index(num_loras: int, - num_slots: int, - log: bool = True) -> List[Optional[int]]: - """Creates a random lora_id_to_index mapping. - - Args: - num_loras: The number of active loras in the mapping. - num_slots: The number of slots in the mapping. Must be larger - than num_loras. - log: Whether to log the output. - """ - - if num_loras > num_slots: - raise ValueError( - f"num_loras is higher than num_slots: {num_loras} > {num_slots}. " - "num_loras must be less than or equal to num_slots.") - - slots: List[Optional[int]] = [None] * num_slots - random_slot_selections = (torch.randperm(num_slots)[:num_loras]).tolist() - for lora_id, slot_idx in enumerate(random_slot_selections, start=1): - slots[slot_idx] = lora_id - - if log: - print(f"Created lora_id_to_index mapping: {slots}.") - - return slots - - -def populate_loras( - id_to_index: List[Optional[int]], - layer: BaseLayerWithLoRA, - layer_weights: torch.Tensor, - generate_embeddings_tensor: int = 0, - repeats: int = 1, -) -> Tuple[Dict[int, LoRALayerWeights], Dict[int, List[LoRALayerWeights]]]: - """This method populates the lora layers with lora weights. - - Args: - id_to_index: a list of lora ids. The index of the lora id - represents which memory slot the lora matrices are - stored in. A None value indicates a free slot. - layer: the LoRAlayer to populate. - layer_weights: the PyTorch tensor containing the layer's - weights. - generate_embeddings_tensor: whether to generate an - embeddings tensor for each LoRA. - repeats: must only be set for column parallel packed - layers. Indicates the number of loras to compose - together to create a single lora layer. - """ - - # Dictionary that maps the lora ID to the - # corresponding lora weights. - lora_dict: Dict[int, LoRALayerWeights] = dict() - - # Dictionary that maps the lora ID to the - # corresponding subloras. Only useful when - # repeats > 1. - sublora_dict: Dict[int, List[LoRALayerWeights]] = dict() - - for slot_idx, lora_id in enumerate(id_to_index): - if lora_id is not None: - subloras = [] - sublora_len = layer_weights.shape[0] // repeats - for i in range(repeats): - sublora = DummyLoRAManager().init_random_lora( - module_name=f"fake_{i}", - weight=layer_weights, - generate_embeddings_tensor=generate_embeddings_tensor, - ) - sublora.lora_b = sublora.lora_b[:, (sublora_len * - i):(sublora_len * (i + 1))] - sublora.optimize() - subloras.append(sublora) - - lora = PackedLoRALayerWeights.pack( - subloras) if repeats > 1 else subloras[0] - - layer.set_lora( - slot_idx, - lora_a=lora.lora_a, - lora_b=lora.lora_b, - embeddings_tensor=lora.embeddings_tensor, - ) - - lora_dict[lora_id] = lora - sublora_dict[lora_id] = subloras - - return lora_dict, sublora_dict - - -def create_random_inputs( - active_lora_ids: List[int], - num_inputs: int, - input_size: Tuple[int, ...], - input_range: Tuple[float, float], - input_type: torch.dtype = torch.int, -) -> Tuple[List[torch.Tensor], List[int], List[int]]: - """Creates random inputs. - - Args: - active_lora_ids: lora IDs of active lora weights. - num_inputs: the number of inputs to create. - input_size: the size of each individual input. - input_range: the range of values to include in the input. - input_range[0] <= possible input values < input_range[1] - input_type: the type of values in the input. 
- """ - - low, high = input_range - - inputs, index_mapping, prompt_mapping = [], [], [] - for _ in range(num_inputs): - if input_type == torch.int: - inputs.append( - torch.randint(low=int(low), - high=int(high), - size=input_size, - device="cuda")) - else: - inputs.append( - torch.rand(size=input_size, dtype=input_type, device="cuda") * - high + low) - - lora_id = random.choice(active_lora_ids) - index_mapping += [lora_id] * input_size[0] - prompt_mapping += [lora_id] - - return inputs, index_mapping, prompt_mapping - - -@torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) -def test_embeddings(dist_init, num_loras) -> None: - - max_loras = 8 - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - lora_dtype=torch.float16) - - def create_random_embedding_layer(): - embedding = VocabParallelEmbedding(512, 256) - embedding.weight.data = torch.rand_like(embedding.weight.data) - embedding.weight.data[512:, :] = 0 - lora_embedding = VocabParallelEmbeddingWithLoRA(embedding) - lora_embedding.create_lora_weights(max_loras, lora_config) - - return embedding, lora_embedding - - for i in range(10): - set_random_seed(i) - - id_to_index = get_random_id_to_index(num_loras, max_loras) - embedding, lora_embedding = create_random_embedding_layer() - - lora_dict, _ = populate_loras( - id_to_index, - layer=lora_embedding, - layer_weights=embedding.weight.T, - ) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=list(lora_dict.keys()), - num_inputs=num_loras * 3, - input_size=(200, ), - input_range=(1, 512), - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, - 512, lora_config.lora_extra_vocab_size) - lora_embedding.set_mapping(*mapping_info) - - lora_result = lora_embedding(torch.cat(inputs)) - - expected_results = [] - for input_, lora_id in zip(inputs, prompt_mapping): - lora = lora_dict[lora_id] - result = embedding(input_) - after_a = F.embedding( - input_, - lora.lora_a, - ) - result += (after_a @ lora.lora_b) - expected_results.append(result) - expected_result = torch.cat(expected_results) - - rtol, atol = TOLERANCES[lora_result.dtype] - assert torch.allclose(lora_result, - expected_result, - rtol=rtol, - atol=atol) - - # Check that resetting the lora weights succeeds - - for slot_idx in range(max_loras): - lora_embedding.reset_lora(slot_idx) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=[0], - num_inputs=num_loras * 3, - input_size=(200, ), - input_range=(1, 512), - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, - 512, lora_config.lora_extra_vocab_size) - lora_embedding.set_mapping(*mapping_info, ) - - lora_result = lora_embedding(torch.cat(inputs)) - expected_result = embedding(torch.cat(inputs)) - - rtol, atol = TOLERANCES[lora_result.dtype] - assert torch.allclose(lora_result, - expected_result, - rtol=rtol, - atol=atol) - - -@torch.inference_mode() -# @pytest.mark.skip(reason="Fails when loras are in any slot other than the first.") -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) -def test_embeddings_with_new_embeddings(dist_init, num_loras) -> None: - - max_loras = 8 - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - lora_dtype=torch.float16) - - def create_random_embedding_layer(): - embedding = VocabParallelEmbedding(512, 256) - embedding_data = torch.rand_like(embedding.weight.data) - 
embedding.weight.data = embedding_data - embedding.weight.data[512:, :] = 0 - expanded_embedding = VocabParallelEmbedding( - 512 + lora_config.lora_extra_vocab_size * max_loras, - 256, - org_num_embeddings=512) - expanded_embedding.weight.data[:512, :] = embedding_data - # We need to deepcopy the embedding as it will be modifed - # in place - lora_embedding = VocabParallelEmbeddingWithLoRA( - deepcopy(expanded_embedding)) - lora_embedding.create_lora_weights(max_loras, lora_config) - - return expanded_embedding, lora_embedding - - for i in range(10): - set_random_seed(i) - - id_to_index = get_random_id_to_index(num_loras, max_loras) - expanded_embedding, lora_embedding = create_random_embedding_layer() - lora_dict, _ = populate_loras( - id_to_index, - layer=lora_embedding, - layer_weights=torch.zeros( - (256, 512 + lora_config.lora_extra_vocab_size)), - generate_embeddings_tensor=256, - ) - - # All embeddings tensors have the same shape. - embeddings_tensors = [ - lora_dict[id].embeddings_tensor for id in sorted(lora_dict.keys()) - ] - embeddings_tensor_len = embeddings_tensors[0].shape[0] - - # Add empty embeddings_tensors for unoccupied lora slots. - for _ in range(max_loras - len(embeddings_tensors)): - embeddings_tensors.append( - torch.zeros(embeddings_tensors[0].shape, device="cuda")) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=list(lora_dict.keys()), - num_inputs=num_loras * 3, - input_size=(200, ), - input_range=(1, 512), - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - original_inputs = deepcopy(inputs) - - # Force some of the inputs to be in the extended embeddings range - # to guarantee that their behavior is tested. - for input_, original_input_, lora_id in zip(inputs, original_inputs, - prompt_mapping): - embedding_id = lora_id - 1 - input_[-1] = 512 + (embedding_id * embeddings_tensor_len) - original_input_[-1] = 512 - input_[-2] = 512 + ((embedding_id + 1) * embeddings_tensor_len - 1) - original_input_[-2] = 512 + embeddings_tensor_len - 1 - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, - 512, lora_config.lora_extra_vocab_size) - lora_embedding.set_mapping(*mapping_info, ) - - expanded_embedding.weight[512:512 + - (embeddings_tensor_len * - max_loras)] = torch.cat(embeddings_tensors) - - lora_result = lora_embedding(torch.cat(original_inputs)) - - expected_results = [] - for input_, original_input_, lora_id in zip(inputs, original_inputs, - prompt_mapping): - lora = lora_dict[lora_id] - result = expanded_embedding(input_) - after_a = F.embedding( - original_input_, - lora.lora_a, - ) - result += (after_a @ lora.lora_b) - expected_results.append(result) - expected_result = torch.cat(expected_results) - - rtol, atol = TOLERANCES[lora_result.dtype] - assert torch.allclose(lora_result, - expected_result, - rtol=rtol, - atol=atol) - - # Check that resetting the lora weights succeeds - - for slot_idx in range(max_loras): - lora_embedding.reset_lora(slot_idx) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=[0], - num_inputs=num_loras * 3, - input_size=(200, ), - input_range=(1, 512), - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - original_inputs = deepcopy(inputs) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, - 512, lora_config.lora_extra_vocab_size) - lora_embedding.set_mapping(*mapping_info, ) - - lora_result = lora_embedding(torch.cat(original_inputs)) - expected_result = expanded_embedding(torch.cat(inputs)) - - 
rtol, atol = TOLERANCES[lora_result.dtype] - assert torch.allclose(lora_result, - expected_result, - rtol=rtol, - atol=atol) - - -@torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) -def test_lm_head_sampler(dist_init, num_loras) -> None: - - max_loras = 8 - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - lora_dtype=torch.float16) - - def create_random_sampler_layer(): - linear = ParallelLMHead(32000 + lora_config.lora_extra_vocab_size, - 1024, 32000) - linear.weight.data = torch.rand_like(linear.weight.data) - linear.weight.data[:, 32000:] = 0 - sampler = Sampler(32000 + lora_config.lora_extra_vocab_size, 32000) - lora_sampler = SamplerWithLoRA(sampler, 1024, linear.weight.dtype, - linear.weight.device) - lora_sampler.create_lora_weights(max_loras, lora_config) - - return linear, sampler, lora_sampler - - for i in range(10): - set_random_seed(i) - - id_to_index = get_random_id_to_index(num_loras, max_loras) - linear, sampler, lora_sampler = create_random_sampler_layer() - - # NOTE: all the generated loras share the same embeddings tensor. - lora_dict, _ = populate_loras( - id_to_index, - layer=lora_sampler, - layer_weights=linear.weight, - generate_embeddings_tensor=1024, - ) - embeddings_tensor = list(lora_dict.values())[0].embeddings_tensor - embeddings_tensor_len = embeddings_tensor.shape[0] - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=list(lora_dict.keys()), - num_inputs=8 * num_loras, # * 3, - input_size=(1, 1024), - input_range=(0, 1), - input_type=torch.float32, - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - input_ = torch.rand(20, 1024, device="cuda") - mapping_info = convert_mapping( - lora_mapping, - id_to_index, - max_loras, - 32000, - lora_config.lora_extra_vocab_size, - ) - lora_sampler.set_mapping(*mapping_info, ) - - lora_result = lora_sampler._get_logits(hidden_states=torch.cat(inputs), - embedding=linear.weight, - embedding_bias=None) - - original_weight = linear.weight.clone() - - linear.weight[sampler.org_vocab_size:sampler.org_vocab_size + - embeddings_tensor_len] = embeddings_tensor - - sampler.org_vocab_size = 32000 + lora_config.lora_extra_vocab_size - expected_results = [] - for input_, lora_id in zip(inputs, prompt_mapping): - lora = lora_dict[lora_id] - result = sampler._get_logits(hidden_states=input_, - embedding=linear.weight, - embedding_bias=None) - result[:, 32000 + embeddings_tensor_len:] = float("-inf") - result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling - expected_results.append(result) - expected_result = torch.cat(expected_results) - sampler.org_vocab_size = 32000 - - # Check that resetting the lora weights succeeds - - for slot_idx in range(max_loras): - lora_sampler.reset_lora(slot_idx) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=[0], - num_inputs=8 * num_loras * 3, - input_size=(1, 1024), - input_range=(0, 1), - input_type=torch.float32, - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, - 32000, - lora_config.lora_extra_vocab_size) - lora_sampler.set_mapping(*mapping_info, ) - - lora_result = lora_sampler._get_logits(hidden_states=torch.cat(inputs), - embedding=original_weight, - embedding_bias=None)[:, :32000] - expected_result = sampler._get_logits(hidden_states=torch.cat(inputs), - embedding=original_weight, - embedding_bias=None) - - rtol, atol = TOLERANCES[lora_result.dtype] - assert 
torch.allclose(lora_result, - expected_result, - rtol=rtol, - atol=atol) - - -@torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) -@pytest.mark.parametrize("orientation", ["row", "column"]) -def test_linear_parallel(dist_init, num_loras, orientation) -> None: - - max_loras = 8 - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - lora_dtype=torch.float16) - - def create_random_linear_parallel_layer(): - if orientation == "row": - linear = RowParallelLinear(4096, 4096, bias=False) - linear.weight.data = torch.rand_like(linear.weight.data) - lora_linear = RowParallelLinearWithLoRA(linear) - else: - linear = ColumnParallelLinear(4096, 4096, bias=False) - linear.weight.data = torch.rand_like(linear.weight.data) - lora_linear = ColumnParallelLinearWithLoRA(linear) - lora_linear.create_lora_weights(max_loras, lora_config) - - return linear, lora_linear - - for i in range(10): - set_random_seed(i) - - id_to_index = get_random_id_to_index(num_loras, max_loras) - linear, lora_linear = create_random_linear_parallel_layer() - - lora_dict, _ = populate_loras( - id_to_index, - layer=lora_linear, - layer_weights=linear.weight, - ) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=list(lora_dict.keys()), - num_inputs=32 * num_loras, - input_size=(1, 4096), - input_range=(0, 1), - input_type=torch.float32, - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping( - lora_mapping, - id_to_index, - max_loras, - 512, - lora_config.lora_extra_vocab_size, - ) - lora_linear.set_mapping(*mapping_info, ) - - lora_result = lora_linear(torch.cat(inputs))[0] - - expected_results = [] - for input_, lora_id in zip(inputs, prompt_mapping): - lora = lora_dict[lora_id] - result = linear(input_)[0] - result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling - expected_results.append(result) - expected_result = torch.cat(expected_results) - - rtol, atol = TOLERANCES[lora_result.dtype] - assert torch.allclose(lora_result, - expected_result, - rtol=rtol, - atol=atol) - - # Check that resetting the lora weights succeeds - - for slot_idx in range(max_loras): - lora_linear.reset_lora(slot_idx) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=[0], - num_inputs=32 * num_loras, - input_size=(1, 4096), - input_range=(0, 1), - input_type=torch.float32, - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras, - 512, lora_config.lora_extra_vocab_size) - lora_linear.set_mapping(*mapping_info, ) - - lora_result = lora_linear(torch.cat(inputs))[0] - expected_result = linear(torch.cat(inputs))[0] - - rtol, atol = TOLERANCES[lora_result.dtype] - assert torch.allclose(lora_result, - expected_result, - rtol=rtol, - atol=atol) - - -@torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) -@pytest.mark.parametrize("repeats", [2, 3]) -def test_column_parallel_packed(dist_init, num_loras, repeats) -> None: - - max_loras = 8 - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - lora_dtype=torch.float16) - - def create_column_parallel_packed_layer(): - if repeats == 2: - linear = MergedColumnParallelLinear(4096, [4096] * repeats, - bias=False) - linear.weight.data = torch.rand_like(linear.weight.data) - lora_linear = MergedColumnParallelLinearWithLoRA(linear) - else: - linear = QKVParallelLinear(4096, 64, 32, bias=False) - linear.weight.data = torch.rand_like(linear.weight.data) - 
lora_linear = QKVParallelLinearWithLora(linear) - - @dataclass - class FakeConfig: - hidden_size = 4096 - num_key_value_heads = 32 - num_attention_heads = 32 - - lora_linear.create_lora_weights(max_loras, - lora_config, - model_config=FakeConfig()) - - return linear, lora_linear - - for i in range(10): - set_random_seed(i) - - id_to_index = get_random_id_to_index(num_loras, max_loras) - - linear, lora_linear = create_column_parallel_packed_layer() - - lora_dict, sublora_dict = populate_loras( - id_to_index, - layer=lora_linear, - layer_weights=linear.weight, - repeats=repeats, - ) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=list(lora_dict.keys()), - num_inputs=32 * num_loras, - input_size=(1, 4096), - input_range=(0, 1), - input_type=torch.float32, - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping( - lora_mapping, - id_to_index, - max_loras, - 512, - lora_config.lora_extra_vocab_size, - ) - lora_linear.set_mapping(*mapping_info) - - lora_result = lora_linear(torch.cat(inputs))[0] - - expected_results = [] - for input_, lora_id in zip(inputs, prompt_mapping): - result = linear(input_)[0] - subloras = sublora_dict[lora_id] - for i, sublora in enumerate(subloras): - result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * ( - i + 1 - )] += input_ @ sublora.lora_a @ sublora.lora_b * sublora.scaling - expected_results.append(result) - expected_result = torch.cat(expected_results) - - rtol, atol = TOLERANCES[lora_result.dtype] - assert torch.allclose(lora_result, - expected_result, - rtol=rtol, - atol=atol) - - for slot_idx in range(max_loras): - lora_linear.reset_lora(slot_idx) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=[0], - num_inputs=32 * num_loras, - input_size=(1, 4096), - input_range=(0, 1), - input_type=torch.float32, - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping) - - mapping_info = convert_mapping( - lora_mapping, - id_to_index, - max_loras, - 512, - lora_config.lora_extra_vocab_size, - ) - lora_linear.set_mapping(*mapping_info) - - lora_result = lora_linear(torch.cat(inputs))[0] - expected_result = linear(torch.cat(inputs))[0] - - rtol, atol = TOLERANCES[lora_result.dtype] - assert torch.allclose(lora_result, - expected_result, - rtol=rtol, - atol=atol) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py deleted file mode 100644 index 06fbf19eea824fbbd699c4a417b0ad61556e5721..0000000000000000000000000000000000000000 --- a/tests/lora/test_llama.py +++ /dev/null @@ -1,144 +0,0 @@ -import pytest -import ray - -import vllm -from vllm.lora.request import LoRARequest -from .conftest import cleanup - -MODEL_PATH = "meta-llama/Llama-2-7b-hf" - - -def do_sample(llm, lora_path: str, lora_id: int): - prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? 
[/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]" - ] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=256, - stop=["[/assistant]"]) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) - # Print the outputs. - generated_texts = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -@pytest.mark.parametrize("tp_size", [1]) -def test_llama_lora(sql_lora_files, tp_size): - # Cannot use as it will initialize torch.cuda too early... - # if torch.cuda.device_count() < tp_size: - # pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - - llm = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=tp_size) - - expected_no_lora_output = [ - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", - "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? 
[/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", - ] - expected_lora_output = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " - ] - - print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output - - print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output - - print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output - - print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output - - print("removing lora") - - -@pytest.mark.skip("Requires multiple GPUs") -def test_llama_tensor_parallel_equality(sql_lora_files): - # Cannot use as it will initialize torch.cuda too early... 
- # if torch.cuda.device_count() < 4: - # pytest.skip(f"Not enough GPUs for tensor parallelism {4}") - - llm_tp1 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=1) - output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1) - - del llm_tp1 - cleanup() - - llm_tp2 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=2) - output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1) - - del llm_tp2 - cleanup() - - assert output_tp1 == output_tp2 - - llm_tp4 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=4) - output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1) - - del llm_tp4 - cleanup() - - assert output_tp1 == output_tp4 - - -def test_llama_lora_warmup(sql_lora_files): - """Test that the LLM initialization works with a warmup LORA path and is more conservative""" - - @ray.remote(num_gpus=1) - def get_num_gpu_blocks_lora(): - llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16) - num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks - return num_gpu_blocks_lora_warmup - - @ray.remote(num_gpus=1) - def get_num_gpu_blocks_no_lora(): - llm = vllm.LLM(MODEL_PATH, max_num_seqs=16) - num_gpu_blocks_no_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks - return num_gpu_blocks_no_lora_warmup - - num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote()) - num_gpu_blocks_no_lora_warmup = ray.get( - get_num_gpu_blocks_no_lora.remote()) - assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, ( - "The warmup with lora should be more" - " conservative than without lora, therefore the number of memory blocks for the KV cache should be " - "less when using lora than when not using lora") diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py deleted file mode 100644 index 3415d36b7e3419ab11235d0d03e3ab8f818b468b..0000000000000000000000000000000000000000 --- a/tests/lora/test_lora.py +++ /dev/null @@ -1,224 +0,0 @@ -import pytest -import torch - -from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice - -from .utils import DummyLoRAManager - -TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] -QKV_TENSOR_SIZES = [ - (8192, 1024, 1024), - (8192 // 8, 1024 // 8, 1024 // 8), - (4096, 4096, 4096), - (4096 // 2, 4096 // 2, 4096 // 2), -] -BATCH_SIZES = [8, 32, 256] -RANKS = [8] -DTYPES = [torch.float16] -TOLERANCES = { - torch.float16: (5e-3, 5e-3), - torch.bfloat16: (3e-2, 2e-2), -} - - -@pytest.mark.parametrize("m", TENSOR_SIZES) -@pytest.mark.parametrize("n", TENSOR_SIZES) -@pytest.mark.parametrize("k", BATCH_SIZES) -@pytest.mark.parametrize("rank", RANKS) -@pytest.mark.parametrize("dtype", DTYPES) -def test_apply_lora(m, n, k, rank, dtype) -> None: - manager = DummyLoRAManager() - - module_name = "module" - weight = torch.rand([m, n], device="cuda", dtype=dtype) - - manager.init_random_lora(module_name, weight, rank=rank) - lora = manager.get_module_lora(module_name) - - input = torch.rand(k, n, device="cuda", dtype=dtype) - expected = input @ lora.lora_a @ lora.lora_b * lora.scaling - - lora_a_stack = torch.zeros(8, - 1, - lora.lora_a.shape[1], - lora.lora_a.shape[0], - device="cuda", - dtype=dtype) - lora_b_stack = torch.zeros(8, - 1, - lora.lora_b.shape[1], - lora.lora_b.shape[0], - device="cuda", - dtype=dtype) - for i in range(lora_a_stack.shape[0]): - lora_a_stack[i][0] = lora.lora_a.T - lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T - - 
output = torch.zeros(k, m, device="cuda", dtype=dtype) - _apply_lora( - input, lora_a_stack, lora_b_stack, - torch.randint(0, lora_a_stack.shape[0], (len(input), ), device="cuda"), - output) - - rtol, atol = TOLERANCES[dtype] - assert torch.allclose(expected, output, rtol=rtol, atol=atol) - - output[:] = 0 - _apply_lora(input, lora_a_stack, lora_b_stack, - torch.full((len(input), ), -1, device="cuda"), output) - assert torch.allclose(torch.zeros_like(output), output) - - manager.reset_lora() - - -@pytest.mark.parametrize("m", TENSOR_SIZES) -@pytest.mark.parametrize("n", TENSOR_SIZES) -@pytest.mark.parametrize("k", BATCH_SIZES) -@pytest.mark.parametrize("rank", RANKS) -@pytest.mark.parametrize("dtype", DTYPES) -def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: - if m % 2 != 0: - pytest.skip("m must be divisible by 2") - if m // 2 not in TENSOR_SIZES: - pytest.skip("m//2 must be in TENSOR_SIZES") - - manager = DummyLoRAManager() - - module_name = "module" - weight = torch.rand([m // 2, n], device="cuda", dtype=dtype) - - manager.init_random_lora(module_name + "1", weight, rank=rank) - lora_1 = manager.get_module_lora(module_name + "1") - manager.init_random_lora(module_name + "2", weight, rank=rank) - lora_2 = manager.get_module_lora(module_name + "2") - - input = torch.rand(k, n, device="cuda", dtype=dtype) - expected = torch.cat([ - input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling, - input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling - ], - dim=1) - - lora_a_stacks = [ - torch.zeros(8, - 1, - lora_1.lora_a.shape[1], - lora_1.lora_a.shape[0], - device="cuda", - dtype=dtype) for i in range(2) - ] - lora_b_stacks = [ - torch.zeros(8, - 1, - lora_1.lora_b.shape[1], - lora_1.lora_b.shape[0], - device="cuda", - dtype=dtype) for i in range(2) - ] - for i in range(lora_a_stacks[0].shape[0]): - lora_a_stacks[0][i][0] = lora_1.lora_a.T - lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T - lora_a_stacks[1][i][0] = lora_2.lora_a.T - lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T - - output = torch.zeros(k, m, device="cuda", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, - lora_a_stacks[0].shape[0], (len(input), ), - device="cuda"), output, (m // 2, m // 2)) - - rtol, atol = TOLERANCES[dtype] - assert torch.allclose(expected, output, rtol=rtol, atol=atol) - - output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, - torch.full((len(input), ), -1, device="cuda"), - output, (m // 2, m // 2)) - assert torch.allclose(torch.zeros_like(output), output) - - manager.reset_lora() - - -@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES) -@pytest.mark.parametrize("n", TENSOR_SIZES) -@pytest.mark.parametrize("k", BATCH_SIZES) -@pytest.mark.parametrize("rank", RANKS) -@pytest.mark.parametrize("dtype", DTYPES) -def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: - manager = DummyLoRAManager() - - module_name = "module" - weight_q = torch.empty(qkv[0], n, device="cuda", dtype=dtype) - weight_kv = torch.empty(qkv[1], n, device="cuda", dtype=dtype) - - manager.init_random_lora(module_name + "q", weight_q, rank=rank) - lora_q = manager.get_module_lora(module_name + "q") - manager.init_random_lora(module_name + "k", weight_kv, rank=rank) - lora_k = manager.get_module_lora(module_name + "k") - manager.init_random_lora(module_name + "v", weight_kv, rank=rank) - lora_v = manager.get_module_lora(module_name + "v") - - input = torch.rand(k, n, device="cuda", dtype=dtype) - expected = 
torch.cat([ - input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling, - input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling, - input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling - ], - dim=1) - - lora_a_stacks = [ - torch.zeros(8, - 1, - lora_q.lora_a.shape[1], - lora_q.lora_a.shape[0], - device="cuda", - dtype=dtype) - ] + [ - torch.zeros(8, - 1, - lora_k.lora_a.shape[1], - lora_k.lora_a.shape[0], - device="cuda", - dtype=dtype) for i in range(2) - ] - lora_b_stacks = [ - torch.zeros(8, - 1, - lora_q.lora_b.shape[1], - lora_q.lora_b.shape[0], - device="cuda", - dtype=dtype) - ] + [ - torch.zeros(8, - 1, - lora_k.lora_b.shape[1], - lora_k.lora_b.shape[0], - device="cuda", - dtype=dtype) for i in range(2) - ] - for i in range(lora_a_stacks[0].shape[0]): - lora_a_stacks[0][i][0] = lora_q.lora_a.T - lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T - lora_a_stacks[1][i][0] = lora_k.lora_a.T - lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T - lora_a_stacks[2][i][0] = lora_v.lora_a.T - lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T - - output = torch.zeros(k, sum(qkv), device="cuda", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, - lora_a_stacks[0].shape[0], (len(input), ), - device="cuda"), output, (qkv[0], qkv[1], qkv[2])) - - rtol, atol = TOLERANCES[dtype] - assert torch.allclose(expected, output, rtol=rtol, atol=atol) - - output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, - torch.full((len(input), ), -1, device="cuda"), - output, (qkv[0], qkv[1], qkv[2])) - assert torch.allclose(torch.zeros_like(output), output) - - manager.reset_lora() diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py deleted file mode 100644 index 78a4a5bc5ecd239992a704b6b497e9b4d912c8ad..0000000000000000000000000000000000000000 --- a/tests/lora/test_lora_manager.py +++ /dev/null @@ -1,475 +0,0 @@ -import os -from typing import List - -import pytest -import torch -from safetensors.torch import load_file -from torch import nn - -from vllm.config import LoRAConfig -from vllm.lora.layers import (ColumnParallelLinearWithLoRA, - RowParallelLinearWithLoRA, - MergedColumnParallelLinearWithLoRA) -from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.models import (EMBEDDING_MODULES, LoRAModel, LoRAModelManager, - LRUCacheLoRAModelManager, LoRAMapping) -from vllm.lora.request import LoRARequest -from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, - WorkerLoRAManager) -from vllm.model_executor.layers.linear import RowParallelLinear - - -def test_from_lora_tensors(sql_lora_files): - tensors = load_file( - os.path.join(sql_lora_files, "adapter_model.safetensors")) - new_embeddings = load_file( - os.path.join(sql_lora_files, "new_embeddings.safetensors")) - lora_model = LoRAModel.from_lora_tensors(1, - 8, - 16, - tensors, - "cuda", - embeddings=new_embeddings) - for module_name, lora in lora_model.loras.items(): - assert lora.module_name == module_name - assert lora.rank == 8 - assert lora.lora_alpha == 16 - assert lora.lora_a is not None - assert lora.lora_b is not None - assert (lora.lora_a.shape[1] == lora.lora_b.shape[0] - ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" - assert lora.lora_a.shape[1] == 8 - embeddings_module = next( - (k for k in EMBEDDING_MODULES if k in module_name), None) - if embeddings_module: - assert torch.equal( - lora.embeddings_tensor, - new_embeddings[EMBEDDING_MODULES[embeddings_module]].to( - 
device=lora.embeddings_tensor.device)) - else: - assert lora.embeddings_tensor is None - - -def create_lora(lora_id: int, model: nn.Module, - sub_modules: List[str]) -> LoRAModel: - loras = {} - for name in sub_modules: - w = model.get_submodule(name).weight - loras[name] = LoRALayerWeights( - name, - 8, - 16, - torch.rand([w.shape[1], 8], device="cuda"), - torch.rand([8, w.shape[0]], device="cuda"), - ) - return LoRAModel(lora_id, 8, loras) - - -def create_packed_lora( - lora_id: int, - model: nn.Module, - module_name, - replaced_module_names, - empty_replaced_module_name=None, -) -> LoRAModel: - w = model.get_submodule(module_name).weight - loras = {} - for replaced_module_name in replaced_module_names: - if replaced_module_name == empty_replaced_module_name: - continue - loras[replaced_module_name] = LoRALayerWeights( - replaced_module_name, - 8, - 16, - torch.rand([w.shape[1], 8], device="cuda"), - torch.rand([8, w.shape[0] // len(replaced_module_names)], - device="cuda"), - ) - return LoRAModel(lora_id, 8, loras) - - -def test_replace_submodules(dist_init, dummy_model): - model = dummy_model - manager = LoRAModelManager(model, - 1, - 1, - 1, - LoRAConfig(max_lora_rank=8, - max_cpu_loras=8, - max_loras=8), - lora_target_modules=["dense1", "layer1.dense2"]) - model = manager.model - - assert isinstance(model.get_submodule("dense1"), - ColumnParallelLinearWithLoRA) - assert isinstance(model.get_submodule("layer1.dense1"), - ColumnParallelLinearWithLoRA) - assert isinstance(model.get_submodule("dense2"), RowParallelLinear) - assert isinstance(model.get_submodule("layer1.dense2"), - RowParallelLinearWithLoRA) - - -def test_lora_model_manager(dist_init, dummy_model): - model = dummy_model - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LoRAModelManager( - model, - 2, - 2, - 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2), - lora_target_modules=["dense1", "dense2", "lm_head"]) - assert all(x is None for x in manager.lora_index_to_id) - assert manager.add_lora(model_lora1) - assert manager.activate_lora(1) - assert manager.lora_index_to_id[0] == 1 - assert not manager.add_lora(model_lora1) - assert not manager.activate_lora(1) - assert manager.add_lora(model_lora2) - assert manager.activate_lora(2) - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - assert not manager.add_lora(model_lora2) - assert not manager.activate_lora(2) - assert manager.add_lora(model_lora3) - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - with pytest.raises(ValueError): - assert manager.activate_lora(3) - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - assert manager.remove_lora(model_lora2.id) - assert manager.lora_index_to_id[1] is None - assert not manager.remove_lora(model_lora2.id) - assert manager.remove_lora(model_lora1.id) - assert not manager.remove_lora(model_lora1.id) - assert manager.add_lora(model_lora1) - assert manager.lora_index_to_id[0] is None - assert manager.lora_index_to_id[1] is None - assert manager.add_lora(model_lora2) - assert manager.activate_lora(3) - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] is None - assert manager.activate_lora(2) - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 2 - - -def 
test_lora_lru_cache_model_manager(dist_init, dummy_model): - model = dummy_model - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, - 2, - 2, - 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2), - lora_target_modules=["dense1", "dense2", "lm_head"]) - assert all(x is None for x in manager.lora_index_to_id) - assert manager.add_lora(model_lora1) - assert manager.activate_lora(1) - assert manager.lora_index_to_id[0] == 1 - assert not manager.add_lora(model_lora1) - assert not manager.activate_lora(1) - assert manager.add_lora(model_lora2) - assert manager.activate_lora(2) - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - assert not manager.add_lora(model_lora2) - assert not manager.activate_lora(2) - assert manager.add_lora(model_lora3) - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - assert manager.activate_lora(3) - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 2 - assert manager.remove_lora(model_lora2.id) - assert manager.lora_index_to_id[1] is None - assert not manager.remove_lora(model_lora2.id) - assert manager.remove_lora(model_lora1.id) - assert not manager.remove_lora(model_lora1.id) - assert manager.add_lora(model_lora1) - assert manager.activate_lora(1) - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 1 - assert manager.add_lora(model_lora2) - assert manager.deactivate_lora(3) - assert manager.lora_index_to_id[0] is None - assert manager.lora_index_to_id[1] == 1 - assert manager.activate_lora(2) - assert manager.lora_index_to_id[0] == 2 - assert manager.lora_index_to_id[1] == 1 - assert manager.activate_lora(3) - assert manager.lora_index_to_id[0] == 2 - assert manager.lora_index_to_id[1] == 3 - - -def test_lru_lora_model_manager(dist_init, dummy_model): - # This tests just the LRU cache functionality, everything else is - # tested in test_lora_model_manager - model = dummy_model - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2), - ["dense1", "dense2", "lm_head"]) - - assert all(x is None for x in manager.lora_index_to_id) - - # Add up to capacity - assert manager.add_lora(model_lora1) - assert manager.add_lora(model_lora2) - assert manager.activate_lora(1) - assert manager.activate_lora(2) - - assert set(manager.list_loras()) == {1, 2} - assert manager.lora_index_to_id[0] == 1 - assert manager.lora_index_to_id[1] == 2 - - # Add over capacity - assert manager.add_lora(model_lora3) - assert manager.add_lora(model_lora4) - assert manager.activate_lora(3) - assert manager.activate_lora(4) - - assert set(manager.list_loras()) == {3, 4} - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 4 - - # Add 3 again to move it to the top and then add 2 - # should return false since it's in already - assert not manager.add_lora(model_lora3) - assert not manager.activate_lora(3) - assert manager.add_lora(model_lora2) - assert manager.activate_lora(2) - - 
assert set(manager.list_loras()) == {3, 2} - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 2 - - # Remove manually - assert manager.remove_lora(3) - assert not manager.remove_lora(3) - - assert set(manager.list_loras()) == {2} - assert manager.lora_index_to_id[0] is None - assert manager.lora_index_to_id[1] == 2 - - assert manager.add_lora(model_lora3) - assert manager.activate_lora(3) - assert manager.add_lora(model_lora4) - assert manager.activate_lora(4) - - assert set(manager.list_loras()) == {3, 4} - assert manager.lora_index_to_id[0] == 3 - assert manager.lora_index_to_id[1] == 4 - - assert manager.remove_oldest_lora() - assert set(manager.list_loras()) == {4} - assert manager.lora_index_to_id[0] is None - assert manager.lora_index_to_id[1] == 4 - - assert manager.remove_oldest_lora() - assert set(manager.list_loras()) == set() - assert all(x is None for x in manager.lora_index_to_id) - - assert not manager.remove_oldest_lora() - assert set(manager.list_loras()) == set() - assert all(x is None for x in manager.lora_index_to_id) - - -def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): - lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) - worker_lora_manager = LRUCacheWorkerLoRAManager( - 4, 2, llama_2_7b_model_extra_embeddings.config.vocab_size, lora_config, - torch.device("cuda")) - worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings) - - mapping = LoRAMapping([], []) - worker_lora_manager.set_active_loras([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("2", 2, sql_lora_files) - ], mapping) - assert worker_lora_manager.list_loras() == {1, 2} - assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1 - assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2 - - worker_lora_manager.set_active_loras([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("3", 3, sql_lora_files), - LoRARequest("4", 4, sql_lora_files) - ], mapping) - assert worker_lora_manager.list_loras() == {1, 2, 3, 4} - assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1 - assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2 - assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 3 - assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4 - - worker_lora_manager.set_active_loras([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("2", 2, sql_lora_files), - LoRARequest("5", 5, sql_lora_files) - ], mapping) - assert worker_lora_manager.list_loras() == {1, 2, 4, 5} - assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1 - assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2 - assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5 - assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4 - - worker_lora_manager.set_active_loras([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("1", 1, sql_lora_files), - LoRARequest("1", 1, sql_lora_files) - ], mapping) - assert worker_lora_manager.list_loras() == {1, 2, 4, 5} - assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1 - assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2 - assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5 - assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4 - - worker_lora_manager.set_active_loras([ - LoRARequest("6", 6, sql_lora_files), - LoRARequest("7", 7, sql_lora_files), - LoRARequest("8", 8, sql_lora_files) - ], mapping) - assert 
worker_lora_manager.list_loras() == {1, 6, 7, 8} - assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1 - assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 7 - assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 8 - assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 6 - - # Over capacity - with pytest.raises(RuntimeError): - worker_lora_manager.set_active_loras([ - LoRARequest("10", 10, sql_lora_files), - LoRARequest("11", 11, sql_lora_files), - LoRARequest("12", 12, sql_lora_files), - LoRARequest("13", 13, sql_lora_files), - LoRARequest("14", 14, sql_lora_files) - ], mapping) - - -def test_worker_lora_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): - # Should remove every LoRA not specified in the request. - lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) - worker_lora_manager = WorkerLoRAManager( - 4, 2, llama_2_7b_model_extra_embeddings.config.vocab_size, lora_config, - torch.device("cuda")) - worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings) - - mapping = LoRAMapping([], []) - worker_lora_manager.set_active_loras([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("2", 2, sql_lora_files) - ], mapping) - assert worker_lora_manager.list_loras() == {1, 2} - assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1 - assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2 - - worker_lora_manager.set_active_loras([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("3", 3, sql_lora_files), - LoRARequest("4", 4, sql_lora_files) - ], mapping) - assert worker_lora_manager.list_loras() == {1, 3, 4} - assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1 - assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 3 - assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 4 - - worker_lora_manager.set_active_loras([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("2", 2, sql_lora_files), - LoRARequest("5", 5, sql_lora_files) - ], mapping) - assert worker_lora_manager.list_loras() == {1, 2, 5} - assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1 - assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2 - assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5 - - worker_lora_manager.set_active_loras([ - LoRARequest("1", 1, sql_lora_files), - LoRARequest("1", 1, sql_lora_files), - LoRARequest("1", 1, sql_lora_files) - ], mapping) - assert worker_lora_manager.list_loras() == {1} - assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1 - assert worker_lora_manager._lora_manager.lora_index_to_id[1] is None - assert worker_lora_manager._lora_manager.lora_index_to_id[2] is None - - worker_lora_manager.set_active_loras([ - LoRARequest("6", 6, sql_lora_files), - LoRARequest("7", 7, sql_lora_files), - LoRARequest("8", 8, sql_lora_files) - ], mapping) - assert worker_lora_manager.list_loras() == {6, 7, 8} - assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 8 - assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 6 - assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 7 - - # Over capacity - with pytest.raises(RuntimeError): - worker_lora_manager.set_active_loras([ - LoRARequest("10", 10, sql_lora_files), - LoRARequest("11", 11, sql_lora_files), - LoRARequest("12", 12, sql_lora_files), - LoRARequest("13", 13, sql_lora_files), - LoRARequest("14", 14, sql_lora_files) - ], mapping) - - -def test_packed_loras(dist_init, dummy_model_gate_up): - model = 
dummy_model_gate_up - model_lora = create_packed_lora( - 1, - model, - module_name="gate_up_proj", - replaced_module_names=["gate_proj", "up_proj"]) - model_lora1 = create_packed_lora( - 2, - model, - module_name="gate_up_proj", - replaced_module_names=["gate_proj", "up_proj"], - empty_replaced_module_name="gate_proj", - ) - - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2), - ["gate_up_proj"]) - model = manager.model - - assert isinstance(model.get_submodule("gate_up_proj"), - MergedColumnParallelLinearWithLoRA) - assert manager.add_lora(model_lora) - assert manager.add_lora(model_lora1) - - packed_lora = model_lora.get_lora("gate_up_proj") - assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights) - - assert torch.allclose(packed_lora.lora_a[0], - model_lora.get_lora("gate_proj").lora_a) - assert torch.allclose(packed_lora.lora_b[0], - model_lora.get_lora("gate_proj").lora_b) - assert torch.allclose(packed_lora.lora_a[1], - model_lora.get_lora("up_proj").lora_a) - assert torch.allclose(packed_lora.lora_b[1], - model_lora.get_lora("up_proj").lora_b) - - packed_lora1 = model_lora1.get_lora("gate_up_proj") - assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights) - - assert packed_lora1.lora_a[0] is None - assert packed_lora1.lora_b[0] is None - assert torch.allclose(packed_lora1.lora_a[1], - model_lora1.get_lora("up_proj").lora_a) - assert torch.allclose(packed_lora1.lora_b[1], - model_lora1.get_lora("up_proj").lora_b) diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py deleted file mode 100644 index 903814faa5dc7b914992cb45b5269c7662fc4407..0000000000000000000000000000000000000000 --- a/tests/lora/test_punica.py +++ /dev/null @@ -1,175 +0,0 @@ -# Based on code from https://github.com/punica-ai/punica - -import pytest -import torch - -import vllm.lora.punica as punica - - -def assert_close(a, b): - rtol, atol = { - torch.float16: (5e-3, 5e-3), - torch.bfloat16: (3e-2, 2e-2), - torch.float32: (None, None), - }[a.dtype] - torch.testing.assert_close(a, b, rtol=rtol, atol=atol) - - -def _lora_ref_impl( - y_final: torch.Tensor, - x: torch.Tensor, - wa_T_all: torch.Tensor, - wb_T_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, -): - y_stage_1 = torch.empty( - (x.size(0), wa_T_all.size(-2)), - dtype=torch.float32, - device=x.device, - ) - bs = x.shape[0] - s = torch.tensor(scale, dtype=torch.float32, device=x.device) - for i, lora_idx in zip(range(bs), indicies.cpu().tolist()): - xi = x[i].unsqueeze(0).to(torch.float32) - wa = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32) - wb = wb_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32) - - tmp = xi @ wa - y_stage_1[i] = tmp.squeeze(0) - y_final[i] += (tmp @ wb).squeeze(0) * s - return y_final, y_stage_1 - - -H1 = H2 = [ - 128, 256, 512, 1024, 1280, 2048, 2560, 2752, 3072, 3456, 3584, 4096, 5120, - 5504, 5632, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, 32000, - 32256, 32512, 32768, 33024 -] -SEED = [0xabcdabcd987] - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("h2", H2) -@pytest.mark.parametrize("seed", SEED) -@torch.inference_mode() -def test_lora_correctness(dtype_str, h1, h2, seed): - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - r = 8 - bs = 32 - scale = 0.123 - dtype = getattr(torch, dtype_str) - device = torch.device("cuda") - - wa_T_all = torch.randn(num_loras, - 
num_layers, - r, - h1, - dtype=dtype, - device=device) - wb_T_all = torch.randn(num_loras, - num_layers, - h2, - r, - dtype=dtype, - device=device) - indices = torch.randint(num_loras, (bs, ), dtype=torch.long, device=device) - - for layer_idx in range(num_layers): - x = torch.randn(bs, h1, dtype=dtype, device=device) - y = torch.randn(bs, h2, dtype=dtype, device=device) - - y_ref = y.clone() - _lora_ref_impl(y_ref, x, wa_T_all, wb_T_all, indices, layer_idx, scale) - - y_our = y.clone() - punica.add_lora(y_our, x, wa_T_all, wb_T_all, indices, layer_idx, - scale) - - assert_close(y_ref, y_our) - - -@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) -@pytest.mark.parametrize("h1", H1) -@pytest.mark.parametrize("h2", H2) -@pytest.mark.parametrize("seed", SEED) -@torch.inference_mode() -def test_lora_correctness_slice(dtype_str, h1, h2, seed): - if h2 % 3 != 0 or h2 // 3 not in H1: - pytest.skip("h2 must be divisible by 3 and in supported shapes") - torch.manual_seed(seed) - num_loras = 4 - num_layers = 1 - r = 8 - bs = 32 - scale = 0.123 - dtype = getattr(torch, dtype_str) - device = torch.device("cuda") - - wa_T_all_0 = torch.randn(num_loras, - num_layers, - r, - h1, - dtype=dtype, - device=device) - wa_T_all_1 = torch.randn(num_loras, - num_layers, - r, - h1, - dtype=dtype, - device=device) - wa_T_all_2 = torch.randn(num_loras, - num_layers, - r, - h1, - dtype=dtype, - device=device) - wb_T_all_0 = torch.randn(num_loras, - num_layers, - h2 // 3, - r, - dtype=dtype, - device=device) - wb_T_all_1 = torch.randn(num_loras, - num_layers, - h2 // 3, - r, - dtype=dtype, - device=device) - wb_T_all_2 = torch.randn(num_loras, - num_layers, - h2 // 3, - r, - dtype=dtype, - device=device) - - indices = torch.randint(num_loras, (bs, ), dtype=torch.long, device=device) - - for layer_idx in range(num_layers): - x = torch.randn(bs, h1, dtype=dtype, device=device) - y = torch.randn(bs, h2, dtype=dtype, device=device) - s = h2 // 3 - - y_ref = y.clone() - _lora_ref_impl(y_ref[:, :s], x, wa_T_all_0, wb_T_all_0, indices, - layer_idx, scale) - _lora_ref_impl(y_ref[:, s:s * 2], x, wa_T_all_1, wb_T_all_1, indices, - layer_idx, scale) - _lora_ref_impl(y_ref[:, s * 2:], x, wa_T_all_2, wb_T_all_2, indices, - layer_idx, scale) - - y_our = y.clone() - punica.add_lora_slice(y_our, x, wa_T_all_0, wb_T_all_0, indices, - layer_idx, scale, 0, s) - punica.add_lora_slice(y_our, x, wa_T_all_1, wb_T_all_1, indices, - layer_idx, scale, s, s) - punica.add_lora_slice(y_our, x, wa_T_all_2, wb_T_all_2, indices, - layer_idx, scale, s * 2, s) - - assert_close(y_ref[:, :s], y_our[:, :s]) - assert_close(y_ref[:, s:s * 2], y_our[:, s:s * 2]) - assert_close(y_ref[:, s * 2:], y_our[:, s * 2:]) diff --git a/tests/lora/test_tokenizer.py b/tests/lora/test_tokenizer.py deleted file mode 100644 index 6c4c91fce81272d251f6fd9dc59b5927235bc2d6..0000000000000000000000000000000000000000 --- a/tests/lora/test_tokenizer.py +++ /dev/null @@ -1,69 +0,0 @@ -import pytest -from transformers import AutoTokenizer, PreTrainedTokenizerBase - -from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizer import TokenizerGroup, get_lora_tokenizer - - -@pytest.mark.asyncio -async def test_transformers_tokenizer(): - reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") - tokenizer = TokenizerGroup( - tokenizer_id="gpt2", - enable_lora=False, - max_num_seqs=1, - max_input_length=None, - ) - assert reference_tokenizer.encode("prompt") == tokenizer.encode( - request_id="request_id", prompt="prompt", lora_request=None) - 
assert reference_tokenizer.encode( - "prompt") == await tokenizer.encode_async(request_id="request_id", - prompt="prompt", - lora_request=None) - assert isinstance(tokenizer.get_lora_tokenizer(None), - PreTrainedTokenizerBase) - assert tokenizer.get_lora_tokenizer( - None) == await tokenizer.get_lora_tokenizer_async(None) - - -@pytest.mark.asyncio -async def test_transformers_tokenizer_lora(sql_lora_files): - reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files) - tokenizer = TokenizerGroup( - tokenizer_id="gpt2", - enable_lora=True, - max_num_seqs=1, - max_input_length=None, - ) - lora_request = LoRARequest("1", 1, sql_lora_files) - assert reference_tokenizer.encode("prompt") == tokenizer.encode( - request_id="request_id", prompt="prompt", lora_request=lora_request) - assert reference_tokenizer.encode( - "prompt") == await tokenizer.encode_async(request_id="request_id", - prompt="prompt", - lora_request=lora_request) - assert isinstance(tokenizer.get_lora_tokenizer(None), - PreTrainedTokenizerBase) - assert tokenizer.get_lora_tokenizer( - None) == await tokenizer.get_lora_tokenizer_async(None) - - assert isinstance(tokenizer.get_lora_tokenizer(lora_request), - PreTrainedTokenizerBase) - assert tokenizer.get_lora_tokenizer( - lora_request) != tokenizer.get_lora_tokenizer(None) - assert tokenizer.get_lora_tokenizer( - lora_request) == await tokenizer.get_lora_tokenizer_async(lora_request) - - -def test_get_lora_tokenizer(sql_lora_files, tmpdir): - lora_request = None - tokenizer = get_lora_tokenizer(lora_request) - assert not tokenizer - - lora_request = LoRARequest("1", 1, sql_lora_files) - tokenizer = get_lora_tokenizer(lora_request) - assert tokenizer.get_added_vocab() - - lora_request = LoRARequest("1", 1, str(tmpdir)) - tokenizer = get_lora_tokenizer(lora_request) - assert not tokenizer diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py deleted file mode 100644 index 2996322f4aa48766da68cfabbeff488cfd9ba13d..0000000000000000000000000000000000000000 --- a/tests/lora/test_utils.py +++ /dev/null @@ -1,172 +0,0 @@ -from collections import OrderedDict - -from torch import nn - -from vllm.utils import LRUCache -from vllm.lora.utils import (parse_fine_tuned_lora_name, replace_submodule) - - -def test_parse_fine_tuned_lora_name(): - fixture = { - ("base_model.model.lm_head.lora_A.weight", "lm_head", True), - ("base_model.model.lm_head.lora_B.weight", "lm_head", False), - ( - "base_model.model.model.embed_tokens.lora_embedding_A", - "model.embed_tokens", - True, - ), - ( - "base_model.model.model.embed_tokens.lora_embedding_B", - "model.embed_tokens", - False, - ), - ( - "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight", - "model.layers.9.mlp.down_proj", - True, - ), - ( - "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight", - "model.layers.9.mlp.down_proj", - False, - ), - } - for name, module_name, is_lora_a in fixture: - assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name) - - -def test_replace_submodule(): - model = nn.Sequential( - OrderedDict([ - ("dense1", nn.Linear(764, 100)), - ("act1", nn.ReLU()), - ("dense2", nn.Linear(100, 50)), - ( - "seq1", - nn.Sequential( - OrderedDict([ - ("dense1", nn.Linear(100, 10)), - ("dense2", nn.Linear(10, 50)), - ])), - ), - ("act2", nn.ReLU()), - ("output", nn.Linear(50, 10)), - ("outact", nn.Sigmoid()), - ])) - - sigmoid = nn.Sigmoid() - - replace_submodule(model, "act1", sigmoid) - assert dict(model.named_modules())["act1"] == sigmoid - - dense2 = nn.Linear(1, 5) - 
replace_submodule(model, "seq1.dense2", dense2) - assert dict(model.named_modules())["seq1.dense2"] == dense2 - - -class TestLRUCache(LRUCache): - - def _on_remove(self, key, value): - if not hasattr(self, "_remove_counter"): - self._remove_counter = 0 - self._remove_counter += 1 - - -def test_lru_cache(): - cache = TestLRUCache(3) - - cache.put(1, 1) - assert len(cache) == 1 - - cache.put(1, 1) - assert len(cache) == 1 - - cache.put(2, 2) - assert len(cache) == 2 - - cache.put(3, 3) - assert len(cache) == 3 - assert set(cache.cache) == {1, 2, 3} - - cache.put(4, 4) - assert len(cache) == 3 - assert set(cache.cache) == {2, 3, 4} - assert cache._remove_counter == 1 - assert cache.get(2) == 2 - - cache.put(5, 5) - assert set(cache.cache) == {2, 4, 5} - assert cache._remove_counter == 2 - - assert cache.pop(5) == 5 - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache.pop(10) - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache.get(10) - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache.put(6, 6) - assert len(cache) == 3 - assert set(cache.cache) == {2, 4, 6} - assert 2 in cache - assert 4 in cache - assert 6 in cache - - cache.remove_oldest() - assert len(cache) == 2 - assert set(cache.cache) == {2, 6} - assert cache._remove_counter == 4 - - cache.clear() - assert len(cache) == 0 - assert cache._remove_counter == 6 - - cache._remove_counter = 0 - - cache[1] = 1 - assert len(cache) == 1 - - cache[1] = 1 - assert len(cache) == 1 - - cache[2] = 2 - assert len(cache) == 2 - - cache[3] = 3 - assert len(cache) == 3 - assert set(cache.cache) == {1, 2, 3} - - cache[4] = 4 - assert len(cache) == 3 - assert set(cache.cache) == {2, 3, 4} - assert cache._remove_counter == 1 - assert cache[2] == 2 - - cache[5] = 5 - assert set(cache.cache) == {2, 4, 5} - assert cache._remove_counter == 2 - - del cache[5] - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache.pop(10) - assert len(cache) == 2 - assert set(cache.cache) == {2, 4} - assert cache._remove_counter == 3 - - cache[6] = 6 - assert len(cache) == 3 - assert set(cache.cache) == {2, 4, 6} - assert 2 in cache - assert 4 in cache - assert 6 in cache diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py deleted file mode 100644 index 68c2c0b5fc1340f58b14ce0a6527686769e373f6..0000000000000000000000000000000000000000 --- a/tests/lora/test_worker.py +++ /dev/null @@ -1,61 +0,0 @@ -import os -import random -import tempfile -from unittest.mock import patch - -from vllm.lora.models import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig -from vllm.worker.worker import Worker - - -@patch.dict(os.environ, {"RANK": "0"}) -def test_worker_apply_lora(sql_lora_files): - worker = Worker( - model_config=ModelConfig( - "meta-llama/Llama-2-7b-hf", - "meta-llama/Llama-2-7b-hf", - tokenizer_mode="auto", - trust_remote_code=False, - download_dir=None, - load_format="dummy", - seed=0, - dtype="float16", - revision=None, - ), - parallel_config=ParallelConfig(1, 1, False), - scheduler_config=SchedulerConfig(32, 32, 32, 256), - local_rank=0, - rank=0, - lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, - max_loras=32), - distributed_init_method=f"file://{tempfile.mkstemp()[1]}", - ) - worker.init_model() - worker.load_model() - - 
worker.model_runner.set_active_loras([], LoRAMapping([], [])) - assert worker.list_loras() == set() - - n_loras = 32 - lora_requests = [ - LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras) - ] - - worker.model_runner.set_active_loras(lora_requests, LoRAMapping([], [])) - assert worker.list_loras() == { - lora_request.lora_int_id - for lora_request in lora_requests - } - - for i in range(32): - random.seed(i) - iter_lora_requests = random.choices(lora_requests, - k=random.randint(1, n_loras)) - random.shuffle(iter_lora_requests) - iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)] - worker.model_runner.set_active_loras(iter_lora_requests, - LoRAMapping([], [])) - assert worker.list_loras().issuperset( - {lora_request.lora_int_id - for lora_request in iter_lora_requests}) diff --git a/tests/lora/utils.py b/tests/lora/utils.py deleted file mode 100644 index 280e0f2043e68d2ba031e54e8431718dc90056c3..0000000000000000000000000000000000000000 --- a/tests/lora/utils.py +++ /dev/null @@ -1,88 +0,0 @@ -from typing import List, Optional - -import torch - -from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights - - -class DummyLoRAManager: - - def __init__(self): - super().__init__() - self._loras = {} - - def set_module_lora(self, module_name: str, lora: LoRALayerWeights): - self._loras[module_name] = lora - - def get_module_lora(self, module_name: str) -> Optional[LoRALayerWeights]: - return self._loras.get(module_name, None) - - def init_random_lora(self, - module_name: str, - weight: torch.Tensor, - rank: int = 8, - generate_embeddings_tensor: int = 0): - lora = LoRALayerWeights( - module_name, - rank=rank, - lora_alpha=1, - lora_a=torch.rand([weight.shape[1], rank], - dtype=weight.dtype, - device="cuda"), - lora_b=torch.rand([rank, weight.shape[0]], - dtype=weight.dtype, - device="cuda"), - ) - if generate_embeddings_tensor: - lora.embeddings_tensor = torch.rand(5, - generate_embeddings_tensor, - dtype=weight.dtype, - device="cuda") - self.set_module_lora(module_name, lora) - - return lora - - def init_lora(self, - module_name: str, - input_dim: int, - output_dim: int, - rank=8, - noop=False, - embeddings_tensor=None): - lora = LoRALayerWeights( - module_name, - rank=rank, - lora_alpha=1, - lora_a=torch.rand([input_dim, rank], device="cuda"), - lora_b=torch.rand([rank, output_dim], device="cuda"), - embeddings_tensor=embeddings_tensor, - ) - self.set_module_lora(module_name, lora) - return lora - - def reset_lora(self): - self._loras = {} - - def init_packed_lora( - self, - module_name: str, - input_dim: int, - output_dims: List[int], - noop_lora_index: List[int] = None, - rank=8, - ): - base_loras = [] - noop_lora_index = set(noop_lora_index or []) - - for i, out_dim in enumerate(output_dims): - base_lora = self.init_lora( - module_name + "_000_" + str(i), - input_dim, - out_dim, - rank=rank, - noop=i in noop_lora_index, - ) - base_loras.append(base_lora) - packed_lora = PackedLoRALayerWeights.pack(base_loras) - self.set_module_lora(module_name, packed_lora) - return packed_lora diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py deleted file mode 100644 index 83316fcb7469d924973b5cbbf63aa989311e5bf6..0000000000000000000000000000000000000000 --- a/tests/models/test_mistral.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling. - -Run `pytest tests/models/test_mistral.py --forked`. 
-""" -import pytest - -MODELS = [ - "mistralai/Mistral-7B-Instruct-v0.1", -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_models( - hf_runner, - vllm_runner, - example_long_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_greedy(example_long_prompts, max_tokens) - del hf_model - - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_greedy(example_long_prompts, max_tokens) - del vllm_model - - for i in range(len(example_long_prompts)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_outputs[i] - assert hf_output_str == vllm_output_str, ( - f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") - assert hf_output_ids == vllm_output_ids, ( - f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") diff --git a/tests/models/test_models.py b/tests/models/test_models.py deleted file mode 100644 index 40858a517b311e1ea754c41ecd679343be093df4..0000000000000000000000000000000000000000 --- a/tests/models/test_models.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Compare the outputs of HF and vLLM when using greedy sampling. - -Run `pytest tests/models/test_models.py --forked`. -""" -import pytest - -MODELS = [ - "facebook/opt-125m", "meta-llama/Llama-2-7b-hf", - "mistralai/Mistral-7B-v0.1", "Deci/DeciLM-7b", "tiiuae/falcon-7b", "gpt2", - "bigcode/tiny_starcoder_py", "EleutherAI/gpt-j-6b", - "EleutherAI/pythia-70m", "bigscience/bloom-560m", "mosaicml/mpt-7b", - "microsoft/phi-2", "stabilityai/stablelm-3b-4e1t" -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - del hf_model - - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - del vllm_model - - for i in range(len(example_prompts)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_outputs[i] - assert hf_output_str == vllm_output_str, ( - f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") - assert hf_output_ids == vllm_output_ids, ( - f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py deleted file mode 100644 index 1e301bedfc21e004179df61945db53eeea844876..0000000000000000000000000000000000000000 --- a/tests/prefix_caching/test_prefix_caching.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Compare the with and without prefix caching. - -Run `pytest tests/prefix_caching/test_prefix_caching.py`. -""" -import pytest - -from vllm import LLM, SamplingParams - -prefix = ( - "You are an expert school principal, skilled in effectively managing " - "faculty and staff. Draft 10-15 questions for a potential first grade " - "Head Teacher for my K-12, all-girls', independent school that emphasizes " - "community, joyful discovery, and life-long learning. The candidate is " - "coming in for a first-round panel interview for a 8th grade Math " - "teaching role. 
They have 5 years of previous teaching experience " - "as an assistant teacher at a co-ed, public school with experience " - "in middle school math teaching. Based on these information, fulfill " - "the following paragraph: ") - - -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) -@pytest.mark.parametrize("max_tokens", [16]) -def test_prefix_caching( - example_prompts, - model: str, - max_tokens: int, -): - llm = LLM(model=model) - # -1 since the last token can change when concatenating prompts. - prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1 - prompts = [prefix + prompt for prompt in example_prompts] - sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - outputs_without_prefix = llm.generate(prompts, sampling_params) - outputs_with_prefix = llm.generate(prompts, - sampling_params, - prefix_pos=[prefix_pos] * len(prompts)) - for output_without_prefix, output_with_prefix in zip( - outputs_without_prefix, outputs_with_prefix): - assert (output_without_prefix.outputs[0].token_ids == - output_with_prefix.outputs[0].token_ids) - assert len(llm.llm_engine.scheduler.prefix_pool.prefixes) == 1 diff --git a/tests/prompts/example.txt b/tests/prompts/example.txt deleted file mode 100644 index e1b97bc6eee7582f79303565fbf2242fd9f78d71..0000000000000000000000000000000000000000 --- a/tests/prompts/example.txt +++ /dev/null @@ -1,8 +0,0 @@ -vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. -Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. -Compare and contrast artificial intelligence with human intelligence in terms of processing information. -Describe the basic components of a neural network and how it can be trained. -Write a short story about a robot that dreams for the first time. -Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. -Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. -Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' 
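The two prompt files removed by this patch (`tests/prompts/example.txt` above, with eight short prompts, and `tests/prompts/summary.txt` below, a single very long prompt) back the `example_prompts` and `example_long_prompts` fixtures requested by the model and prefix-caching tests earlier in the patch. The fixture wiring itself is not shown here, so the following is only a minimal sketch, assuming a hypothetical `tests/conftest.py` helper that reads one prompt per non-empty line:

```python
# Hypothetical sketch: the real tests/conftest.py is not part of this patch.
# Only the fixture names and prompt-file locations are taken from the tests above.
import os
from typing import List

import pytest

_PROMPTS_DIR = os.path.join(os.path.dirname(__file__), "prompts")


def _read_prompts(filename: str) -> List[str]:
    """Return one prompt per non-empty line of a file in tests/prompts/."""
    with open(os.path.join(_PROMPTS_DIR, filename)) as f:
        return [line.strip() for line in f if line.strip()]


@pytest.fixture
def example_prompts() -> List[str]:
    # Eight short prompts (tests/prompts/example.txt).
    return _read_prompts("example.txt")


@pytest.fixture
def example_long_prompts() -> List[str]:
    # A single long summarization prompt (tests/prompts/summary.txt).
    return _read_prompts("summary.txt")
```

Tests such as `test_models`, `test_mistral`, and `test_prefix_caching` then request these fixtures by name and feed the resulting prompt lists to the HF and vLLM runners for comparison.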
diff --git a/tests/prompts/summary.txt b/tests/prompts/summary.txt deleted file mode 100644 index 2f947a264ce93c3b1f10b53e8ae15b2ea2de1586..0000000000000000000000000000000000000000 --- a/tests/prompts/summary.txt +++ /dev/null @@ -1 +0,0 @@ -Subtitles: for our annual races at Knockhill Circuit.Today\'s racing comes from the Porsche Carrera Cup Great Britainand the Legends Cars Elite Cup with JLM.It\'s the latter who get us underway with their first race of the day,and joining me in the commentary box is Paul O\'Neill.First race of the day for the Legends.Jonty Norman has drawn pole position,with Matt Knight alongside.Marcus Pett on Row 2 with Daniel Pooley.Declan Burke is next up, and then Tyler Read, on Row 3.He\'s leading the rookie championship at the moment.Chris Needham on Row 4 with Luke Simmons.Andrew Rogerson and Gareth Sheridan on Row 5.Sixth row, Peter Barrable, with Charlie Budd.Row 7, Jack Parker, fourth in the championship right now.Nick Price is next to him.Will Gibson, who looks like he\'s out of the championship contention now,with Oli Schlup alongside.Then Ben McNeice and Flight Lieutenant Matt Isherwood.Robert Barrable, championship leader, he\'s on Row 10.Then Brent Bowie from Kieran Beattie and Nick Bridgeman.Mike Schlup on Row 12, followed by Ryan McLeish,who won the day overall yesterday.Mark Beaty, Row 13, with Andy Bird.Then it\'s Ben Higgins and Nathan Anthony.Connor Mills and Paul Musselle complete Row 15.And completing the grid is James Newbery.Here we go, with Race number 1 of the day,the final day of the first ever Legends Cars Elite Cup with JLM.And on the front row, it\'s Jonty Norman in grey,Matt Knight in black and gold.Coming from third place on the grid is Marcus Pett,who goes left of shot in the gunmetal carto challenge for the lead.Marcus Pett, the man from Boston in Lincolnshire,goes through into lead position.Very definitely a fancied championship runnerbut hasn\'t quite had the rub of the green this weekend.And they all pile into McIntyre\'s for the first time.And this is where we look for driving standards.James Newbery brakes at the back.He\'s got Paul Musselle immediately in front of him.Those two had an interesting battle yesterdayinvolving a little bit of contact, I think,but they\'re both all right at the moment, as they clear the chicane for the first time.Marcus Pett is away.The difference you\'ll see in Legends Cars racing todayis that for this meeting,the bump drafting that we\'ve seen in the pasthas been ruled out for this round,and it\'s under review for the future.But look at the battle for second position, three wide,as Marcus Pett comes in front of the crowds here.Matt Knight on the inside, Dan Pooley on the outside in 32.Dan Pooley challenging for third. He had a strong day yesterday -he was up in the top ten, which was great to see.The man from March.That third car there, eclipsed at the moment,comes out of the slipstream.Dan repaired his own car after Croft,and that of Kieran Beaty,so I know Kieran wanted to thank him for that. 
He\'s been working hard.And Pooley side by side with Matt Knight.We\'ve got the 13, Chris Needham car, up there in the mix as well.The three top guys in the...Ryan McLeish getting very sideways there,the Scot in the 71 car.The first time we\'ve seen him on our ITV coverage.He\'s not a guest driver this week.I suppose you could technically call him a guest,but he\'s fully championship registeredand took a splendid win yesterday - overall win and race win.Overall on points.Sorry, Paul, gets a chance to get you in.That\'s Jack Parker!Oh, what\'s happened there?So, this was the start. They\'re all still warming the tyres up,ready for the lights to go green,which they do... around about now.And they get going.And then there was a car, wasn\'t there?Oh, I tell you what, that could\'ve ended up really nastyas it snaked up the grass.Yeah, I\'ll tell you what, the moment when the lights went outwas when Marcus Pett broke ranks.That was a very, very meticulous start from Marcus Pett.The blue car here is Tyler Read, top rookie,who looks like he\'s going down the inside of Daniel Pooley,so he\'s gonna make a space here.So, Dan Pooley has lost second position.It\'s Marcus Pett still out front. Matt Knight...I was saying to the drivers,"Don\'t go away if you\'re in the lead because you won\'t get any coverage." Pett\'s down the road, isn\'t he? Look at the gap he\'s got. Yeah.He\'s got three seconds. It\'s gonna be more than that.What I was quite concerned about was the damp part of the circuitdown at the hairpin, where you need to be down the inside of peopleto get the braking done,but these guys seem to be all respecting...Not track limits, but they\'re respecting each other around usbecause I was quite concerned about coming here,but this is quite synonymous with Legends racing at Knockhill.And look at this now. Knight has got...Look at that. I remember Marcus getting his first race win,which was at Snetterton years ago.It\'s always fantastic to see a first-time winner.And Tyler Read is giving him a great workout.Matt Knight back in third.It\'s between the top two at the moment. Oh! Tyler goes wide.He\'s throwing the car around.Marcus Pett, looking a little bit smoother in the 79,was very frustrated yesterday, but Read\'s all over him.Yeah, but look at this now.You\'ve got third, fourth, fifth and sixth.This is gonna be absolutely spectacular!Tyler Read\'s gone! What\'s gone on?!Oh, has the Treherne engine gone pop? He\'s lost a lot of ground.Is he gonna come back into it?Now it\'s Knight having a go on the outside line again.Matt Knight can\'t do it. He runs out wide.Oli Schlup\'s coming through.Schlup hasn\'t had a win yet in Legends cars, so he\'s queueing up.They\'re coming onto the last lap.This could be a key moment for Oli Schlup,who\'s back in third in the K-Seal car.Across the line.Marcus Pett soaking up the pressure brilliantly so far.But does he need to be in front as they come onto the last lap?I don\'t know, but I think Read must have missed a gear,as someone\'s exited stage left.Look at that, back in the mix!It\'s now six for the lead. Can Pett hold on?Championship leader Robert Barrablehas come through from about three rows from the back,and he\'s at the back of the train.Barrable here is gonna extend his championship leadand start towards the front of the grid for Race 2.Barrable, the Irishman, he\'s there.The white car with the green and orange stripeson the nose cone of the car.But it\'s Marcus Pett out front at the moment... 
Oh!Matt Isherwood\'s rejoined at the back in the black and green.Isherwood\'s got back at them. Matt Knight\'s having a go.Along Railway Straight.Schlup would normally bump draft him. He can\'t do that on the rules.But look at Marcus Pett.Fairly wide-ish line in. Good defensive stuff from Pett.It\'s all about the run up to the hill now.And Marcus Pett is gonna take the win, I think.Here they come, up towards the line. Pett from Matt Knight.It\'s gonna be Matt\'s best resultin the Legends Cars National Championship.Third position goes to Oli Schlup, who is delighted with that.Then it was Tyler Read. Great race from him.Robert Barrable, though...Barrable, from 19th on the grid, without bump drafting,comes through into fifth placeahead of the excellent recovery from Flight Lieutenant Matt Isherwood.Dan Pooley seventh. Another great result for Dan Pooley.So much to take away from those last racing laps.Oh, and those last four lapsis exactly why we have these Legends on the TOCA package.That was exceptional.Marcus Pett looked like a dead cert not to finish first,but congratulations to you. That was brilliant.But Barrable, after exiting stage leftwhen he caught the back of everybody and got right up there...There\'s too much to talk about. Let\'s just talk about this guy.Pett, you are a legend, mate. Well done.Cracking. It is a lad and dad.Literally, Marcus and his dad, Robert, they look after the car.It is lad and dad. We hear that mentioned in other formulas,but genuinely, that is all it is.It is very difficult for drivers like that and teams like thatto come and race on this stage.It is a big thing. And he\'s such a smashing guy.And his dad as well. Really delighted with the win.Super stuff by Matt Knight. brilliant from Oli Schlup.Fantastic as well from Tyler Read.And on the front row,it\'s Jonty Norman in grey, Matt Knight in black and gold.Coming from third place on the grid is Marcus Pett.Bit of a shemozzle at the back.Two cars hooked up, which is not good to see.Oh, has the Treherne engine gone pop? He\'s lost a lot of ground.Now it\'s Knight having a go on the outside line again.Matt Knight can\'t do it. He runs out wide.Oli Schlup\'s coming through.And Marcus Pett is gonna take the win, I think. Pett from Matt Knight. It\'s gonna be Matt\'s best resultin the Legends Cars National Championship.Here\'s how they finished.Marcus Pett takes another win in the Legends Cars Elite Cup with JLM.READS INFOREADS INFOREADS INFOREADS INFOREADS INFOREADS INFOProblems in that race for Ryan McLeish, yesterday\'s winner.Charlie Budd in 30th.And the other driver having problems, obviously,from that first stoppage, Brent Bowie.Marcus, that was a tough racebecause there was a red flag in the middle of it.Actually, the first bit, you got away,but it was a full reset,and pressure throughout to the chequered flag.Yeah, definitely.We had an ideal start and managed to build up a lead early on,which was great, but when you\'re in that position,the last thing you want to see is a red flag. 
iming line at the end of lap one.So, Gus Burton leads the way.Big, big dive by Foster on the inside,to go back ahead of Wylie.He goes off the road and back on again.He\'s all sideways.And diving up on the outside line comes Ryan Ratcliffe.Wylie here battling with one of the Pro category cars,but behind him, all the Pro-Am opposition crawling all over him.Well, that was dramatic stuff, wasn\'t it?Round the outside of Turn 1, put Harry Foster in the wrong place.That was Max Bird going wide, number 44, the pink and blue car.So that\'s just haemorrhaged places in Pro-Am.And he\'s the... Oh, a puncture.There\'s somebody with a puncture. Is that Angus Whiteside? Possibly.Let\'s see.I think it is. And you\'ve got this damp patch on the inside,on the braking there, just at the final into the hairpin.This has been a dramatic start to this race for Porsches.Absolutely right.Coming up over the timing line, Gus Burton leads the way.Nine tenths of a second to the good.Big effort being made by Jason Lockwoodin the yellow and orange car in the background, look,to try to get up the inside line, then diving down towards Turn 1.Goes ahead of Oliver White, the very experienced Formula 4 champion.In the silver car, Oliver White, back into Carrera Cup.Remember, he did a full season last year.Good to have him back on the grid.As the cars clamber their way up over the kerb,through the chicane.But Gus Burton saying to everybody, "I\'m back." He leads.Yeah, a dramatic way for Gus Burton to come back to this championship.Remember, he started this year with Century Motorsport but then ducked out of the championship prior to Thruxton.He\'s still competing in the Supercup series with Fach Auto.As there in the pits, getting a new rear left tyre, is Angus Whiteside.But Gus Burton absolutely on it.Very quick in testing here during the week.They tested on Wednesday and on Friday.Gus Burton very quick in...And he\'s really enjoying life now.Back in the championship with the NAPA Racing UK supportand with a different team, Nick Tandy\'s JTR outfit.And he\'s done the fastest lap of the race, as he leads.He is not in the championship fight, but he wants to win races.Car off. It\'s Max Bird again.So, Max Bird, the Pro-Am championship leader,three times a winner in class this year,off the road and back on again.But that\'s gonna throw him way, way down the order.This race is going from bad to worse for him.It\'s just completely unfolded for poor Max Bird.That\'s the curse of having our camera on board, I think,but it\'s just unravelled after a great qualifying.Now, you were talking about Gus Burton\'s start,and it is going to be investigated after the race.OK. Well, it\'ll take a lot of camera action analysisto look at it. This is on board with Bird.Round Turn 1.All OK there. Very close... Goes to the outside.That\'s dangerous cos you can get knocked wide,and that\'s exactly what happens.The man he was trying to get past, Josh Stanton,who spent last night trackside at Cowdenbeath watching stock cars.I\'m not suggesting for a moment he\'s learnt how to defend,but he was enjoying himself, watching a different form of racing.I think all the best people were at Cowdenbeath, weren\'t they?Nick Tandy was, and others. 
Oh!As there, absolutely on the giddy limit, is Harry Foster,making his way in sixth place.Down towards the hairpin.He\'s dropped back from that leading quintet,but he\'s keeping Ross Wylie at bay.Ross Wylie, there, creeping into shot, leads now Pro-Amahead of Ryan Ratcliffe.And Josh Stanton is third in Pro-Am, last year\'s Am champion.Yeah, and Ross Wylie the only Scottish driver in the race. A lot of support for him,from local sponsors as well as the public.Buoyed by his recent run at the British Grand Prix at Supercup,and thoroughly loving racing at his home circuit, Ross Wylie.Track is nicely dry.There was some threats of possible rain.We had rain yesterday during qualifying.They actually only got one runon their slick tyres yesterday in qualifyingbefore the rain arrived, and that set the grid.So, Gus Burton\'s lead growing all the time.1.3 seconds now, that margin over Adam Smalley.As Max Bird tries to fight back in Pro-Am.Gets up the inside line there.So, that puts him ahead of David Stirling.So, he\'s split the second and third Am fightas he tries to recover.Yeah, but he\'s lost a lot of ground with that momenton the outside of McIntyre\'s.It\'s getting a lot darker overhead at Knockhill,even though there is a break in the cloud.A big effort there from the lapped car of Angus Whiteside.He\'s not fighting for position, he\'s trying to unlap himself.But just wonder whether we might get so f the right of McIntyre\'s,up towards Butcher\'s, then the chicane.And looking to try and maintain this 100% recordin the Team Parker Racing-run car in Am.Yeah. David Fairbrother in second place,but some 11 seconds behind in the Am category.But he will take another podium.His second in the championship, too, Justin Sherwood.The race leader 2.5 seconds to the good, Gus Burton.Other battles still to be resolved.What\'s going on in Pro-Am? Ross Wylie leads.He\'s fallen back behind Josh Malin overall. That was the move.Josh Malin through on the inside at the hairpin.Ross Wylie, in a sense, content to let that happen - gave him room -because that\'s not his battle, but what it does meanis that Ryan Ratcliffe, his class rival,is directly behind him.This is William Aspin versus Max Bird for sixth in Pro-Am.And a very determined Max Bird goes one side, get his nose chopped off.Will Aspin, the man from Florence, defends on the other side.They\'re absolutely together, almost touching.Here comes Max Bird.Oh, but he can\'t find a way through there.Angus Whiteside is now getting in on the act.Round the outside goes Max Bird, but they both take it wide,and through goes Angus Whiteside on the inside.Doesn\'t affect the race order.Whiteside unlaps himself from those two cars. Will Aspin stays ahead. Max Bird tries to fight back.Down towards Duffus Dip.Ignore the car in the lead of this battle packbecause it\'s not on the lead lap.But then Aspin under attack.Max Bird tries to get up alongside himfor the inside line coming into McIntyre\'s.He is on the inside, and he is ahead now.Yeah. And behind him, there was a car completely off on the grassafter Turn 1.So I do think that section of the track is a little slippery,for whatever reason. 
Maybe it just hasn\'t quite dried out.But this was a great battle between Max Bird and Will Aspin.So, drivers, in one or two cases,setting personal best lap times last time around,suggesting that the road is drying still.The cars are getting lighter on fuel anyway.Down at the hairpin comes the recovering Max Bird,as over the line goes Harry Foster, being chased by Josh Malin.Josh up into seventh overall.A top six could be on - he\'s only half a second back.Yeah, it\'s not far away, is it?And still plenty of laps left in this race.You probably noticed through that Turn 1the drivers are not riding the big kerb on the inside.That\'s because it\'s a new kerb that\'s been put in, actually,to raise the level of the kerbback to the level it was before the track got resurfaced twice.But with the resurfacing twice,it had raised the track surface by 80mm,and the drivers found they were, in previous years,able to use that kerb.Now? Not so much.So, there going through is Oliver Wight in the silver car,down towards the hairpin.Jason Lockwood ahead of him.Jason for EXCELR8, and he is running in 12 at the moment,which is potentially going to be his best finish of the year.It\'s been a tough season for Jason,but he could be on for his best results thus far.However, Gus Burton has rather dominated this,and look at the gap that he\'s pulled.Adam Smalley, as we suggested earlier,might be thinking about banking points,but it doesn\'t look as though he\'s been able to do anything at allabout that JTR car ahead.No. In terms of pure speed,he hasn\'t been able to threaten Gus Burton at all, has he? Gus Burton has led every race.As he\'s now passing David Fairbrotherat the back of the field.But he\'s had this race under control.But unfortunately, he\'s got this investigation after the racefor a possible false start hanging over him.And if, if, if anything is found, and it\'s a false start,normally that\'s a ten-second penalty,and he\'s not ten seconds ahead,so there is gonna be a postscript to this story, that\'s for sure.Now, this is Henry Dawes, Ollie Jacksoncoming through the chicane.Dawes goes wide, goes through the gravel,goes over the grass, loses a place,gets it all sideways, but just about saves it by the end of the straight.Yeah, nearly lost it on the wet grass.Oh. Harry Foster.This is passing David Fairbrother again, further back.So, this is Smalley versus Matty Graham for second place.So, this gap has come r. \n\n Your task is to create long detailed paragraph-by-paragraph summary. Detailed paragraph-by-paragraph summary of the text above: \ No newline at end of file diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py deleted file mode 100644 index a491ffa763505e1d447c9647b8d6858f1ecee72d..0000000000000000000000000000000000000000 --- a/tests/samplers/test_beam_search.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Compare the outputs of HF and vLLM when using beam search. - -Run `pytest tests/samplers/test_beam_search.py --forked`. -""" -import pytest - -# FIXME(zhuohan): The test can not pass if we: -# 1. Increase max_tokens to 256. -# 2. Increase beam_width to 8. -# 3. Use the model "huggyllama/llama-7b". 
-MAX_TOKENS = [128] -BEAM_WIDTHS = [4] -MODELS = ["facebook/opt-125m"] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", MAX_TOKENS) -@pytest.mark.parametrize("beam_width", BEAM_WIDTHS) -def test_beam_search_single_input( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - beam_width: int, -) -> None: - hf_model = hf_runner(model, dtype=dtype) - hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, - max_tokens) - del hf_model - - vllm_model = vllm_runner(model, dtype=dtype) - vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, - max_tokens) - del vllm_model - - for i in range(len(example_prompts)): - hf_output_ids, _ = hf_outputs[i] - vllm_output_ids, _ = vllm_outputs[i] - assert len(hf_output_ids) == len(vllm_output_ids) - for j in range(len(hf_output_ids)): - assert hf_output_ids[j] == vllm_output_ids[j], ( - f"Test{i} output{j}:\nHF: {hf_output_ids}\n" - f"vLLM: {vllm_output_ids}") diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py deleted file mode 100644 index 0ea3704462fcbac4d9be73170406c261f80ffea5..0000000000000000000000000000000000000000 --- a/tests/samplers/test_logprobs.py +++ /dev/null @@ -1,56 +0,0 @@ -import pytest -import torch - -from vllm import SamplingParams - -MODELS = ["facebook/opt-125m"] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_get_prompt_logprobs( - hf_runner, - vllm_runner, - model, - dtype, - example_prompts, -): - max_tokens = 5 - hf_model = hf_runner(model, dtype=dtype) - hf_logprobs = hf_model.generate_greedy_logprobs( - example_prompts, - max_tokens=max_tokens, - ) - del hf_model - - vllm_model = vllm_runner(model, dtype=dtype) - vllm_sampling_params = SamplingParams(max_tokens=max_tokens, - logprobs=5, - prompt_logprobs=5, - temperature=0.0) - vllm_results = vllm_model.model.generate( - example_prompts, sampling_params=vllm_sampling_params) - del vllm_model - - # Test whether logprobs are included in the results. 
- for result in vllm_results: - assert result.prompt_logprobs is not None - assert result.outputs[0].logprobs is not None - - # Test whether prompt logprobs are consistent with HF - for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs): - # Check prompt logprobs - vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] - for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): - for token_id, logprob in vllm_prompt_logprob_dict.items(): - torch.testing.assert_close(logprob, - hf_logprob[0][i][token_id].item(), - atol=1e-2, - rtol=1e-2) - vllm_sample_logprobs = vllm_result.outputs[0].logprobs - for i, vllm_sample_logprob_dict in enumerate(vllm_sample_logprobs): - for token_id, logprob in vllm_sample_logprob_dict.items(): - torch.testing.assert_close(logprob, - hf_logprob[i][-1][token_id].item(), - atol=1e-2, - rtol=1e-2) diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py deleted file mode 100644 index 9d3ef3c67d3dcbc013e47d45f7af5c401470c1f7..0000000000000000000000000000000000000000 --- a/tests/samplers/test_rejection_sampler.py +++ /dev/null @@ -1,392 +0,0 @@ -"""Tests for rejection sampling.""" -import pytest -from typing import List, Tuple - -import torch -import torch.nn.functional as F - -from vllm.model_executor.utils import set_random_seed - -from vllm.model_executor.layers.rejection_sampler import RejectionSampler - - -def mock_causal_accepted_tensor( - k: int, last_accepted_indices: torch.Tensor) -> torch.Tensor: - """Generate an "accepted" tensor which should yield causally-accepted tokens - up to last accepted indices. - - Tokens after last_accepted_indices+1 may also be accepted, although they - will not be causally accepted. - """ - batch_size = last_accepted_indices.shape[0] - - accepted = (torch.arange(k).expand(batch_size, k) <= - last_accepted_indices.unsqueeze(-1).broadcast_to( - batch_size, k)).to(device="cuda") - - # Sprinkle accepted values after the contiguous initial accepted values. - # This replicates the behavior of rejection sampling, which may "accept" - # a token that cannot be accepted because of causality. - sprinkle_candidates = ( - torch.arange(k).expand(batch_size, k) > - last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1) - sprinkle = torch.rand(batch_size, k, device="cuda") > 0.5 - accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates] - return accepted - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize( - "which_tokens_accepted", - ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"]) -@torch.inference_mode() -def test_correct_output_format(which_tokens_accepted: str, seed: int): - """Verify the output has correct format given predetermined accepted matrix. 
- """ - set_random_seed(seed) - - batch_size = 10 - k = 5 - vocab_size = 3000 - - if which_tokens_accepted == "all_tokens_accepted": - accepted = mock_causal_accepted_tensor( - k, -1 + k * torch.ones((batch_size, ), dtype=torch.long)) - elif which_tokens_accepted == "no_tokens_accepted": - accepted = mock_causal_accepted_tensor( - k, -torch.ones((batch_size, ), dtype=torch.long)) - elif which_tokens_accepted == "some_tokens_accepted": - last_accepted_indices = torch.randint(low=-1, - high=k, - size=(batch_size, )) - accepted = mock_causal_accepted_tensor(k, last_accepted_indices) - else: - raise AssertionError() - - recovered_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64, - device="cuda") - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64, - device="cuda") - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64, - device="cuda") - - rejection_sampler = RejectionSampler() - rejection_sampler.init_gpu_tensors(rank=0) - output_token_ids = rejection_sampler._create_output( # pylint: disable=protected-access - accepted, - recovered_token_ids, - draft_token_ids, - bonus_token_ids, - ) - - if which_tokens_accepted == "all_tokens_accepted": - # Expect all tokens to be equal to draft tokens. - assert torch.equal(output_token_ids[:, :-1], draft_token_ids) - - # Expect all bonus tokens to be included. - assert torch.equal(output_token_ids[:, -1:], bonus_token_ids) - elif which_tokens_accepted == "no_tokens_accepted": - # Expect first token to be equal to recovered tokens. - assert torch.equal(output_token_ids[:, 0], recovered_token_ids[:, 0]) - - # Expect everything else to be -1. - assert torch.equal(output_token_ids[:, 1:], - torch.ones_like(output_token_ids[:, 1:]) * -1) - elif which_tokens_accepted == "some_tokens_accepted": - recovered_plus_bonus = torch.cat( - (recovered_token_ids, bonus_token_ids), dim=-1) - # Assert first rejected token is a recovered token or bonus token. - assert torch.equal( - recovered_plus_bonus[torch.arange(0, batch_size), - last_accepted_indices + 1], - output_token_ids[torch.arange(0, batch_size), - last_accepted_indices + 1]) - - # Assert every subsequent token is -1. 
- subsequent_mask = torch.arange(0, k + 1).expand( - batch_size, k + 1) >= (last_accepted_indices + 2).unsqueeze(-1) - assert torch.all(output_token_ids[subsequent_mask] == -1) - - -@pytest.mark.parametrize("k", list(range(1, 6))) -@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) -@pytest.mark.parametrize("batch_size", list(range(1, 32))) -@torch.inference_mode() -def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int): - rejection_sampler = RejectionSampler() - rejection_sampler.init_gpu_tensors(rank=0) - - draft_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device="cuda") - target_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device="cuda") - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64, - device="cuda") - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64, - device="cuda") - - rejection_sampler(target_probs, bonus_token_ids, draft_probs, - draft_token_ids) - - -@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) -@pytest.mark.parametrize("which_token_ids", - ["bonus_token_ids", "draft_token_ids"]) -@torch.inference_mode() -def test_raises_when_vocab_oob(above_or_below_vocab_range: str, - which_token_ids: str): - k = 3 - batch_size = 5 - vocab_size = 30_000 - - rejection_sampler = RejectionSampler(strict_mode=True) - rejection_sampler.init_gpu_tensors(rank=0) - - draft_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device="cuda") - target_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device="cuda") - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64, - device="cuda") - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64, - device="cuda") - - oob_token_ids = None - if which_token_ids == "bonus_token_ids": - oob_token_ids = bonus_token_ids - elif which_token_ids == "draft_token_ids": - oob_token_ids = draft_token_ids - else: - raise AssertionError() - - if above_or_below_vocab_range == "above": - rogue_token_id = vocab_size + 1 - elif above_or_below_vocab_range == "below": - rogue_token_id = -1 - else: - raise AssertionError() - - oob_token_ids[0][0] = rogue_token_id - - with pytest.raises(AssertionError): - rejection_sampler(target_probs, bonus_token_ids, draft_probs, - draft_token_ids) - - -@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False]) -@pytest.mark.parametrize("seed", list(range(5))) -@torch.inference_mode() -def test_rejection_sampling_approximates_target_distribution( - seed: int, draft_and_target_probs_equal: bool): - """Verify rejection sampling approximates target distribution, - despite sampling from a potentially distinct draft distribution. - - This is done by first creating a random target probability - distribution and a random draft probability distribution. We then - sample token ids from the rejection sampler using these draft - and target distributions. The samples are used to estimate - the output probability distribution, which we expect to approximate - the target distribution. - - A basic distance metric is used to determine similarity between - distributions. - - We expect that as we increase the number of samples, - the distance between the observed distribution and the target - distribution decreases. 
To measure this, we compare the distance - of the observed distribution against both the target distribution - and a uniform random distribution. We expect the distance between - the observed distribution and the target distribution to improve - much more than the distance improvement between the observed - distribution and the random distribution. - - When draft_and_target_probs_equal=True, the draft and target - probabilities are exactly equal. Rejection sampling should - still work without any NaNs or exceptions. - """ - set_random_seed(seed) - - helper = _CorrectnessTestHelper( - vocab_size=10, - rejection_sampler=RejectionSampler(), - ) - - draft_probs, target_probs, reference_probs = helper.generate_probs_for_test( - draft_and_target_probs_equal) - - sample_sizes = [10, 100, 1_000, 10_000, 100_000] - distance_wrt_reference = [] - distance_wrt_target = [] - - for num_samples in sample_sizes: - (reference_vs_rejsample_dist, - target_vs_rejsample_dist) = helper.run_and_compare_distributions( - draft_probs, - target_probs, - reference_probs, - num_samples, - ) - - distance_wrt_reference.append(reference_vs_rejsample_dist) - distance_wrt_target.append(target_vs_rejsample_dist) - - relative_change_in_distance_wrt_target = get_ratio_first_to_last( - distance_wrt_target) - relative_change_in_distance_wrt_reference = get_ratio_first_to_last( - distance_wrt_reference) - - print(f"{num_samples=} {target_vs_rejsample_dist=:.05f} " - f"{reference_vs_rejsample_dist=:.05f}") - print(f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} " - f"{relative_change_in_distance_wrt_reference=:.02f}") - - relative_change_in_distance_wrt_target = get_ratio_first_to_last( - distance_wrt_target) - relative_change_in_distance_wrt_reference = get_ratio_first_to_last( - distance_wrt_reference) - - expected_improvement_multiplier = 20 - assert (relative_change_in_distance_wrt_target > - relative_change_in_distance_wrt_reference * - expected_improvement_multiplier) - - -def get_ratio_first_to_last(elements: List[float]) -> float: - return elements[0] / elements[-1] - - -class _CorrectnessTestHelper: - """Class that packages together logic required for the unit-level - rejection sampling correctness test. - """ - - def __init__(self, vocab_size: int, rejection_sampler: RejectionSampler): - self.rejection_sampler = rejection_sampler - self.vocab_size = vocab_size - self.vocab_range = (0, vocab_size) - - self.rejection_sampler.init_gpu_tensors(rank=0) - - # Keep test simple, use k=1 - self.k = 1 - - # Bonus tokens not used, but rejection sampler requires - # correct shape. - self.num_bonus_tokens = 1 - - def generate_probs_for_test( - self, draft_and_target_probs_equal: bool - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - draft_probs, target_probs = [ - F.softmax( - torch.rand(self.vocab_size, dtype=torch.float32), - dim=-1, - ) for _ in range(2) - ] - - num_reference_probs = 100 - reference_probs = F.softmax( - torch.rand(num_reference_probs, - self.vocab_size, - dtype=torch.float32), - dim=-1, - ) - - if draft_and_target_probs_equal: - target_probs = draft_probs.clone() - - return draft_probs, target_probs, reference_probs - - def run_and_compare_distributions(self, draft_probs: torch.Tensor, - target_probs: torch.Tensor, - reference_probs: torch.Tensor, - num_samples: int) -> Tuple[float, float]: - # Sample using rejection sampling. - rej_sample_probs = self._estimate_rejection_sampling_pdf( - draft_probs, target_probs, num_samples) - - # Average distance from reference probs. 
- reference_vs_rejsample_dist = torch.dist( - reference_probs, - rej_sample_probs).item() / reference_probs.shape[0] - target_vs_rejsample_dist = torch.dist(target_probs, - rej_sample_probs).item() - - return reference_vs_rejsample_dist, target_vs_rejsample_dist - - def _estimate_rejection_sampling_pdf( - self, - draft_probs: torch.Tensor, - target_probs: torch.Tensor, - num_samples: int, - ) -> torch.Tensor: - # Repeat draft probs num_samples times. - draft_probs = draft_probs.reshape(1, self.k, self.vocab_size).repeat( - num_samples, 1, 1) - - # Repeat target probs num_samples * k times. - # Rejection sampler requires bonus token probs, but they aren't used. - target_probs = target_probs.reshape(1, 1, self.vocab_size).repeat( - num_samples, self.k, 1) - - # Randomly sample draft token ids from draft probs. - draft_token_ids = torch.multinomial(draft_probs[:, 0, :], - num_samples=1, - replacement=True).reshape( - num_samples, self.k) - - # Bonus tokens not used but required. - bonus_token_ids = torch.zeros((1, self.num_bonus_tokens), - dtype=torch.int64, - device="cuda").repeat(num_samples, 1) - - # Get output tokens via rejection sampling. - output_token_ids = self.rejection_sampler(target_probs.to("cuda"), - bonus_token_ids.to("cuda"), - draft_probs.to("cuda"), - draft_token_ids.to("cuda")) - - # Remove bonus tokens - output_token_ids = output_token_ids[:, :-1].flatten() - - # Estimate probability density function - hist = torch.histogram(output_token_ids.to(dtype=torch.float, - device="cpu"), - bins=self.vocab_size, - range=self.vocab_range, - density=True) - - return hist.hist diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py deleted file mode 100644 index 962183a29fbfafc5d8df7c485c0c9b58cdd4157e..0000000000000000000000000000000000000000 --- a/tests/samplers/test_sampler.py +++ /dev/null @@ -1,316 +0,0 @@ -import random -from typing import Tuple -from unittest.mock import patch - -import pytest -import torch -from transformers import GenerationConfig, GenerationMixin - -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.worker.model_runner import ModelRunner - - -class MockLogitsSampler(Sampler): - - def __init__(self, vocab_size: int, fake_logits: torch.Tensor): - super().__init__(vocab_size=vocab_size) - self.fake_logits = fake_logits - - def forward(self, *args, **kwargs): - with patch( - "vllm.model_executor.layers.sampler._prune_hidden_states", - lambda x, y: x), patch( - "vllm.model_executor.layers.sampler.Sampler._get_logits", - lambda *args, **kwargs: self.fake_logits): - return super().forward(*args, **kwargs) - - -def _prepare_test( - batch_size: int -) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]: - vocab_size = 32000 - input_tensor = torch.rand((batch_size, 1024), - device="cuda", - dtype=torch.float16) - fake_logits = torch.full((batch_size, vocab_size), - 1e-2, - device=input_tensor.device, - dtype=input_tensor.dtype) - sampler = MockLogitsSampler(32000, fake_logits) - model_runner = ModelRunner(None, None, None, None) - return input_tensor, fake_logits, sampler, model_runner - - -RANDOM_SEEDS = list(range(128)) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_sampler_all_greedy(seed: int): - set_random_seed(seed) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, sampler, model_runner = _prepare_test( - batch_size) - - seq_group_metadata_list = 
[] - prompt_lens = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0, ), - block_tables={0: [1]}, - )) - prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens=prompt_lens) - sampler_output = sampler(embedding=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) - expected = torch.argmax(fake_logits, dim=-1) - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == expected[i].item() - - del model_runner - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_sampler_all_random(seed: int): - set_random_seed(seed) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, sampler, model_runner = _prepare_test( - batch_size) - - for i in range(batch_size): - fake_logits[i, i] = 1e2 - - seq_group_metadata_list = [] - prompt_lens = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, - sampling_params=SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - ), - block_tables={0: [1]}, - )) - prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens=prompt_lens) - sampler_output = sampler(embedding=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == i - - del model_runner - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_sampler_all_beam(seed: int): - set_random_seed(seed) - batch_size = random.randint(1, 256) - input_tensor, _, sampler, model_runner = _prepare_test(batch_size) - - seq_group_metadata_list = [] - prompt_lens = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, - sampling_params=SamplingParams( - temperature=0, - best_of=2, - use_beam_search=True, - ), - block_tables={0: [1]}, - )) - prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens=prompt_lens) - sampler(embedding=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) - # no assertion here as I am not sure how to determine whether - # the outputs are expected - in other words, this just tests - # whether there are no exceptions in the sampler - # when handling an all-beam search case. 
- del model_runner - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_sampler_mixed(seed: int): - set_random_seed(seed) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, sampler, model_runner = _prepare_test( - batch_size) - - seq_group_metadata_list = [] - expected_tokens = [] - prompt_lens = [] - for i in range(batch_size): - n = 1 - sampling_type = random.randint(0, 2) - if sampling_type == 0: - sampling_params = SamplingParams(temperature=0) - elif sampling_type == 1: - n = random.randint(1, 10) - sampling_params = SamplingParams( - temperature=random.random() + 0.1, - top_p=min(random.random() + 0.1, 1), - top_k=random.randint(0, 10) or -1, - n=n, - presence_penalty=random.randint(0, 1), - ) - else: - sampling_params = SamplingParams(temperature=0, - use_beam_search=True, - best_of=2) - for idx in range(n): - fake_logits[i, i + idx] = 1e2 - expected_tokens.append(i + idx) - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, - sampling_params=sampling_params, - block_tables={0: [1]}, - )) - prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens=prompt_lens) - sampler_output = sampler(embedding=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) - for i, sequence_output in enumerate(sampler_output): - if seq_group_metadata_list[i].sampling_params.use_beam_search: - continue - for nth_output in sequence_output.samples: - assert nth_output.output_token in expected_tokens - - del model_runner - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_sampler_logits_processors(seed: int): - set_random_seed(seed) - batch_size = random.randint(1, 256) - input_tensor, _, sampler, model_runner = _prepare_test(batch_size) - - # This sample logits processor gives infinite score to the i-th token, - # where i is the length of the input sequence. - # We therefore expect the output token sequence to be [0, 1, 2, ...] 
- def pick_ith(token_ids, logits): - logits[len(token_ids)] = float("inf") - return logits - - seq_group_metadata_list = [] - prompt_lens = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0, - logits_processors=[pick_ith]), - block_tables={0: [1]}, - )) - prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens=prompt_lens) - sampler_output = sampler(embedding=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) - for _, sequence_output in enumerate(sampler_output): - for idx, nth_output in enumerate(sequence_output.samples): - assert nth_output.output_token == idx - - del model_runner - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_sampler_top_k_top_p(seed: int): - set_random_seed(seed) - batch_size = random.randint(1, 256) - top_k = random.randint(100, 500) - top_p = random.random() * 0.1 - vocab_size = 32000 - input_tensor = torch.rand((batch_size, 1024), - device="cuda", - dtype=torch.float16) - fake_logits = torch.normal(0, - 5, - size=(batch_size, vocab_size), - device=input_tensor.device, - dtype=input_tensor.dtype) - sampler = MockLogitsSampler(32000, fake_logits) - model_runner = ModelRunner(None, None, None, None) - - generation_model = GenerationMixin() - generation_config = GenerationConfig(top_k=top_k, - top_p=top_p, - do_sample=True) - warpers = generation_model._get_logits_warper(generation_config) - assert len(warpers) == 2 # top_p and top_k - - seq_group_metadata_list = [] - prompt_lens = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, - sampling_params=SamplingParams( - temperature=1, - top_k=top_k, - top_p=top_p, - ), - block_tables={0: [1]}, - )) - prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens=prompt_lens) - - sample_probs = None - - def mock_sample(probs, logprobs, sampling_metadata): - nonlocal sample_probs - sample_probs = probs - return [[prob.topk(1, dim=-1).indices.tolist(), [0]] for prob in probs] - - with patch("vllm.model_executor.layers.sampler._sample", mock_sample): - sampler(embedding=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) - hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone()) - hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) - assert torch.allclose(hf_probs, sample_probs, atol=1e-5) - assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) - - del model_runner diff --git a/tests/test_regression.py b/tests/test_regression.py deleted file mode 100644 index c48e474bd889f407a0c8c56dc198828211cd92ec..0000000000000000000000000000000000000000 --- a/tests/test_regression.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Containing tests that check for regressions in vLLM's behavior. - -It should include tests that are reported by users and making sure they -will never happen again. 
- -""" -from vllm import LLM, SamplingParams - - -def test_duplicated_ignored_sequence_group(): - """https://github.com/vllm-project/vllm/issues/1655""" - - sampling_params = SamplingParams(temperature=0.01, - top_p=0.1, - max_tokens=256) - llm = LLM(model="facebook/opt-125m", - max_num_batched_tokens=4096, - tensor_parallel_size=1) - prompts = ["This is a short prompt", "This is a very long prompt " * 1000] - outputs = llm.generate(prompts, sampling_params=sampling_params) - - assert len(prompts) == len(outputs) - - -def test_max_tokens_none(): - sampling_params = SamplingParams(temperature=0.01, - top_p=0.1, - max_tokens=None) - llm = LLM(model="facebook/opt-125m", - max_num_batched_tokens=4096, - tensor_parallel_size=1) - prompts = ["Just say hello!"] - outputs = llm.generate(prompts, sampling_params=sampling_params) - - assert len(prompts) == len(outputs) - - -if __name__ == "__main__": - import pytest - pytest.main([__file__]) diff --git a/tests/test_sampling_params.py b/tests/test_sampling_params.py deleted file mode 100644 index 01cbe0c997f298a9cf86d342244f9122e38cdcc2..0000000000000000000000000000000000000000 --- a/tests/test_sampling_params.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Tests for the SamplingParams class. -""" -from vllm import SamplingParams - - -def test_max_tokens_none(): - """max_tokens=None should be allowed""" - SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None) - - -if __name__ == "__main__": - import pytest - pytest.main([__file__]) diff --git a/tests/worker/__init__.py b/tests/worker/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/tests/worker/spec_decode/__init__.py b/tests/worker/spec_decode/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/tests/worker/spec_decode/test_multi_step_worker.py b/tests/worker/spec_decode/test_multi_step_worker.py deleted file mode 100644 index ea548029035780cc3f53115f60c2183dc43a75cb..0000000000000000000000000000000000000000 --- a/tests/worker/spec_decode/test_multi_step_worker.py +++ /dev/null @@ -1,261 +0,0 @@ -import torch -import random -import pytest -from unittest.mock import MagicMock - -from vllm.worker.spec_decode.multi_step_worker import MultiStepWorker -from vllm.worker.worker import Worker -from vllm.model_executor.utils import set_random_seed - -from .utils import (create_execute_model_data, create_worker, - create_seq_group_metadata_from_prompts, zero_kv_cache, - patch_execute_model_with_seeds, - assert_logprobs_dict_allclose) - - -@pytest.mark.parametrize('num_steps', list(range(1, 17))) -def test_assert_enough_kv_space(num_steps: int): - """Test that the multi step worker checks for sufficient space in the KV - cache. It should throw if it cannot run all the steps. 
- """ - block_size = 16 - num_gpu_blocks = 2048 // block_size - - prompts = [ - list(range(block_size * 3)), - list(range(block_size * 2)), - ] - - prev_output_tokens = [ - list(range(block_size * 1)), - list(range(block_size * 2)), - ] - - final_seq_lens = [ - len(prompt + output) + num_steps - for prompt, output in zip(prompts, prev_output_tokens) - ] - - inputs = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_seq_lens, - continuations=prev_output_tokens) - - assert_enough_kv_space = MultiStepWorker._assert_enough_kv_space # pylint: disable=protected-access - worker = MagicMock() - worker.model_runner.block_size = block_size - - for seq_group_metadata in inputs: - original_block_tables = seq_group_metadata.block_tables - - # No exception. - assert_enough_kv_space(worker, inputs, num_steps) - - seq_group_metadata.block_tables = { - seq_id: [] - for seq_id, physical_blocks in original_block_tables.items() - } - - # Expect exception. - with pytest.raises(ValueError, - match='times but found insufficient KV space for'): - assert_enough_kv_space(worker, inputs, num_steps) - - seq_group_metadata.block_tables = original_block_tables - - -@torch.inference_mode() -def test_same_output_for_single_step(): - """Verify the multi step worker produces the same output as the normal - worker for num_steps=1. - """ - seed = 100 - model_name = 'JackFram/llama-68m' - - block_size = 32 - num_gpu_blocks = 2048 // block_size - multi_step_worker = create_worker( - MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - worker = create_worker( - Worker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - multi_step_worker.model_runner = worker.model_runner - multi_step_worker.cache_engine = worker.cache_engine - - num_steps = 1 - - prompts = [ - [1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - ] - - final_seq_lens = [len(prompt) + num_steps for prompt in prompts] - - multi_step_execute_model_data = create_execute_model_data( - seq_group_metadata_list=create_seq_group_metadata_from_prompts( - prompts, num_gpu_blocks, block_size, - final_seq_lens=final_seq_lens)) - - single_step_execute_model_data = create_execute_model_data( - seq_group_metadata_list=create_seq_group_metadata_from_prompts( - prompts, num_gpu_blocks, block_size, - final_seq_lens=final_seq_lens)) - - zero_kv_cache(multi_step_worker.cache_engine) - set_random_seed(seed) - actual_output = multi_step_worker.execute_model_multi_step( - **multi_step_execute_model_data.to_dict(), num_steps=num_steps) - assert len(actual_output) == num_steps - actual_output = actual_output[0] - - zero_kv_cache(worker.cache_engine) - set_random_seed(seed) - expected_output = worker.execute_model( - **single_step_execute_model_data.to_dict(), ) - - actual_token_ids = [ - output.samples[0].output_token for output in actual_output - ] - actual_logprobs = [output.samples[0].logprobs for output in actual_output] - - expected_token_ids = [ - output.samples[0].output_token for output in expected_output - ] - expected_logprobs = [ - output.samples[0].logprobs for output in expected_output - ] - - assert actual_token_ids == expected_token_ids - - print(f'{actual_logprobs=}') - print(f'{expected_logprobs=}') - assert_logprobs_dict_allclose(actual_logprobs, expected_logprobs) - - -@torch.inference_mode() -def test_same_output_for_multi_step(): - """Verify the multi-step worker produces the same output as the normal - worker when num_steps > 1. 
This test runs the multi-step worker once, and - then runs the worker num_steps times, and compares the output. - """ - seed = 100 - model_name = 'JackFram/llama-68m' - - block_size = 16 - num_gpu_blocks = 2048 // block_size - multi_step_worker = create_worker( - MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - - worker = create_worker( - Worker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - - # Make sure we go over the block boundary. - num_steps = block_size + 1 - - random.seed(seed) - prompts = [[ - random.randint(0, 1000) for _ in range(random.randint(10, 20)) - ] for _ in range(10)] - - final_seq_lens = [len(prompt) + num_steps for prompt in prompts] - - rand_seeds = list(random.randint(0, 100) for _ in range(num_steps)) - multi_step_worker.execute_model = patch_execute_model_with_seeds( - multi_step_worker, rand_seeds) - worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds) - - continuations = [[1] for _ in prompts] - execute_model_data = create_execute_model_data( - create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_seq_lens=final_seq_lens), ) - - # Run multi-step. - zero_kv_cache(multi_step_worker.cache_engine) - set_random_seed(seed) - multi_step_output = multi_step_worker.execute_model_multi_step( - **execute_model_data.to_dict(), num_steps=num_steps) - - # Run single-step repeatedly. - zero_kv_cache(worker.cache_engine) - single_step_output = [] - continuations = [[1] for _ in prompts] - set_random_seed(seed) - - for _ in multi_step_output: - - execute_model_data = create_execute_model_data( - create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_seq_lens=final_seq_lens)) - - single_step_output.append( - worker.execute_model(**execute_model_data.to_dict(), )) - - # Append output tokens to new sequence data. - for i, seq_group_output in enumerate(single_step_output[-1]): - continuations[i].append(seq_group_output.samples[0].output_token) - - # Get token ids and logprobs for comparison. - multi_step_output_logprobs = [[] for _ in prompts] - single_step_output_logprobs = [[] for _ in prompts] - - multi_step_output_token_ids = [[] for _ in prompts] - single_step_output_token_ids = [[] for _ in prompts] - for i, _ in enumerate(prompts): - for multi_step, single_step in zip(multi_step_output, - single_step_output): - multi_step_output_token_ids[i].append( - multi_step[i].samples[0].output_token) - single_step_output_token_ids[i].append( - single_step[i].samples[0].output_token) - - multi_step_output_logprobs[i].append( - multi_step[i].samples[0].logprobs) - single_step_output_logprobs[i].append( - single_step[i].samples[0].logprobs) - - # Print per-sequence token ids - for i, (multi_step_tokens, single_step_tokens) in enumerate( - zip(multi_step_output_token_ids, single_step_output_token_ids)): - print(f'{i=} {multi_step_tokens=}') - print(f'{i=} {single_step_tokens=}') - print(f'{i=} equal {multi_step_tokens == single_step_tokens}') - - # Assert token ids are equal. - for multi_step_tokens, single_step_tokens in zip( - multi_step_output_token_ids, single_step_output_token_ids): - assert multi_step_tokens == single_step_tokens - - # Assert logprobs are equal. 
- for multi_step_logprobs, single_step_logprobs in zip( - multi_step_output_logprobs, single_step_output_logprobs): - assert_logprobs_dict_allclose(multi_step_logprobs, - single_step_logprobs) diff --git a/tests/worker/spec_decode/utils.py b/tests/worker/spec_decode/utils.py deleted file mode 100644 index e0db770046ec849800fc87568ac8cb73644cec40..0000000000000000000000000000000000000000 --- a/tests/worker/spec_decode/utils.py +++ /dev/null @@ -1,177 +0,0 @@ -import torch -from typing import List, Optional, Dict - -from vllm.worker.worker import Worker -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import SequenceGroupMetadata, SequenceData -from vllm.sampling_params import SamplingParams -from vllm.worker.cache_engine import CacheEngine -from vllm.model_executor.utils import set_random_seed -from dataclasses import dataclass, fields - - -@dataclass -class ExecuteModelData: - """Helper data structure which facilitates cleaner tests. - """ - seq_group_metadata_list: List[SequenceGroupMetadata] - blocks_to_swap_in: Dict[int, int] - blocks_to_swap_out: Dict[int, int] - blocks_to_copy: Dict[int, List[int]] - - def to_dict(self): - return dict( - (field.name, getattr(self, field.name)) for field in fields(self)) - - -def round_up_to_next_block(seq_len: int, block_size: int) -> int: - return (seq_len + block_size - 1) // block_size - - -def create_execute_model_data( - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Optional[Dict[int, int]] = None, - blocks_to_swap_out: Optional[Dict[int, int]] = None, - blocks_to_copy: Optional[Dict[int, int]] = None, -) -> ExecuteModelData: - if blocks_to_swap_in is None: - blocks_to_swap_in = {} - if blocks_to_swap_out is None: - blocks_to_swap_out = {} - if blocks_to_copy is None: - blocks_to_copy = {} - - return ExecuteModelData( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ) - - -def patch_execute_model_with_seeds(worker: Worker, rand_seeds: List[int]): - seed_iter = iter(rand_seeds) - original_execute_model = worker.execute_model - - def new_execute_model(*args, **kwargs): - result = original_execute_model(*args, **kwargs) - set_random_seed(next(seed_iter)) - return result - - return new_execute_model - - -def zero_kv_cache(cache_engine: CacheEngine): - assert cache_engine.gpu_cache - for key_blocks, value_blocks in cache_engine.gpu_cache: - key_blocks.zero_() - value_blocks.zero_() - - -def create_worker(cls: type, - model_name: str, - block_size: int, - num_gpu_blocks: int, - seed: int, - is_driver_worker: bool = True, - enforce_eager: bool = True): - engine_args = EngineArgs( - model=model_name, - seed=seed, - block_size=block_size, - enforce_eager=enforce_eager, - ) - - (model_config, cache_config, parallel_config, scheduler_config, - _) = engine_args.create_engine_configs() - - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - - worker = cls( - model_config=model_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=is_driver_worker, - ) - - worker.init_model() - worker.load_model() - - cache_config.num_gpu_blocks = num_gpu_blocks - cache_config.num_cpu_blocks = 0 - worker.init_cache_engine(cache_config) - worker.warm_up_model() - - return worker - - -def 
create_seq_group_metadata_from_prompts( - prompts: List[List[int]], - num_gpu_blocks: int, - block_size: int, - final_seq_lens: List[int], - continuations: Optional[List[List[int]]] = None, - num_tokens_processed: Optional[List[int]] = None, - seq_ids: Optional[List[int]] = None, -) -> List[SequenceGroupMetadata]: - - if continuations is None: - continuations = [[] for _ in prompts] - - if num_tokens_processed is None: - # Default to 1 token missing from kv cache for generation sequences. - num_tokens_processed = [] - for continuation, prompt in zip(continuations, prompts): - # If prefill, then default to zero tokens processed. - if not continuation: - num_tokens_processed.append(0) - else: - # If generation, then default to all but one tokens processed. - num_tokens_processed.append( - len(continuation) + len(prompt) - 1) - - if seq_ids is None: - seq_ids = list(i for i, _ in enumerate(prompts)) - - free_gpu_blocks = list(range(num_gpu_blocks)) - - block_allocations = { - i: [ - free_gpu_blocks.pop() - for _ in range(round_up_to_next_block(final_len, block_size)) - ] - for i, final_len in enumerate(final_seq_lens) - } - - return [ - SequenceGroupMetadata( - request_id=str(i), - is_prompt=len(cont_token_ids) == 0, - seq_data={ - i: - SequenceData(prompt_token_ids=prompt_token_ids[:] + - cont_token_ids[:]) - }, - sampling_params=SamplingParams(temperature=0.0, ), - block_tables={i: block_allocations[i][:]}, - ) for i, (prompt_token_ids, cont_token_ids, num_tokens_saved) in - enumerate(zip(prompts, continuations, num_tokens_processed)) - ] - - -def assert_logprobs_dict_allclose( - actual_logprobs: List[Dict[int, float]], - expected_logprobs: List[Dict[int, float]]) -> None: - for single_step_actual_logprobs, single_step_expected_logprobs in zip( - actual_logprobs, expected_logprobs): - assert set(single_step_actual_logprobs.keys()) == set( - single_step_expected_logprobs.keys()) - for token_id in single_step_actual_logprobs: - actual = torch.tensor(single_step_actual_logprobs[token_id]) - expected = torch.tensor(single_step_expected_logprobs[token_id]) - assert torch.allclose(actual, expected) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py deleted file mode 100644 index 5d9ad0520de13d2cddaeb3132441ec106bed982b..0000000000000000000000000000000000000000 --- a/tests/worker/test_model_runner.py +++ /dev/null @@ -1,50 +0,0 @@ -import random -import torch - -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.worker.model_runner import ModelRunner - - -def test_prepare_prompt(): - model_runner = ModelRunner(None, None, None, None) - model_runner.set_block_size(16) - - batch_size = random.randint(1, 256) - prompt_lens = [] - seq_group_metadata_list = [] - for i in range(batch_size): - # make sure all tokens fit into one block - prompt_len = i % (model_runner.block_size - 1) + 1 - prompt_lens.append(prompt_len) - seq_data = list(range(prompt_len)) - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData(seq_data)}, - sampling_params=SamplingParams(temperature=0), - block_tables={0: [1]}, - )) - - expected_selected_token_indices = [] - selected_token_start_idx = 0 - max_seq_len = max(prompt_lens) - for prompt_len in prompt_lens: - expected_selected_token_indices.append(selected_token_start_idx + - prompt_len - 1) - selected_token_start_idx += max_seq_len - input_tokens, input_positions, _, return_prompt_lens, _, _, _, _ = ( - 
model_runner._prepare_prompt(seq_group_metadata_list)) - assert return_prompt_lens == prompt_lens - sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens=prompt_lens) - assert input_tokens.shape == (batch_size, max_seq_len) - assert input_positions.shape == (batch_size, max_seq_len) - torch.testing.assert_close(input_tokens, input_positions) - - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) diff --git a/vllm/__init__.py b/vllm/__init__.py deleted file mode 100644 index 327dfad06352c05b0564bced0baf90836c0a2c7d..0000000000000000000000000000000000000000 --- a/vllm/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -"""vLLM: a high-throughput and memory-efficient inference engine for LLMs""" - -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.llm_engine import LLMEngine -from vllm.engine.ray_utils import initialize_cluster -from vllm.entrypoints.llm import LLM -from vllm.outputs import CompletionOutput, RequestOutput -from vllm.sampling_params import SamplingParams - -__version__ = "0.2.7" - -__all__ = [ - "LLM", - "SamplingParams", - "RequestOutput", - "CompletionOutput", - "LLMEngine", - "EngineArgs", - "AsyncLLMEngine", - "AsyncEngineArgs", - "initialize_cluster", -] diff --git a/vllm/block.py b/vllm/block.py deleted file mode 100644 index 5fe39ed47b2ff4effc9abc703c03370b0d142834..0000000000000000000000000000000000000000 --- a/vllm/block.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Token blocks.""" -from typing import List - -from vllm.utils import Device - -_BLANK_TOKEN_ID = -1 - - -class LogicalTokenBlock: - """A block that stores a contiguous chunk of tokens from left to right. - - Logical blocks are used to represent the states of the corresponding - physical blocks in the KV cache. - """ - - def __init__( - self, - block_number: int, - block_size: int, - ) -> None: - self.block_number = block_number - self.block_size = block_size - - self.token_ids = [_BLANK_TOKEN_ID] * block_size - self.num_tokens = 0 - - def is_empty(self) -> bool: - return self.num_tokens == 0 - - def get_num_empty_slots(self) -> int: - return self.block_size - self.num_tokens - - def is_full(self) -> bool: - return self.num_tokens == self.block_size - - def append_tokens(self, token_ids: List[int]) -> None: - assert len(token_ids) <= self.get_num_empty_slots() - curr_idx = self.num_tokens - self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids - self.num_tokens += len(token_ids) - - def get_token_ids(self) -> List[int]: - return self.token_ids[:self.num_tokens] - - def get_last_token_id(self) -> int: - assert self.num_tokens > 0 - return self.token_ids[self.num_tokens - 1] - - -class PhysicalTokenBlock: - """Represents the state of a block in the KV cache.""" - - def __init__( - self, - device: Device, - block_number: int, - block_size: int, - ) -> None: - self.device = device - self.block_number = block_number - self.block_size = block_size - - self.ref_count = 0 - - def __repr__(self) -> str: - return (f'PhysicalTokenBlock(device={self.device}, ' - f'block_number={self.block_number}, ' - f'ref_count={self.ref_count})') - - -# Mapping: logical block number -> physical block. 
-BlockTable = List[PhysicalTokenBlock] diff --git a/vllm/config.py b/vllm/config.py deleted file mode 100644 index 197f20c1ec9a5e8b05877a029deb7592a26cd5f2..0000000000000000000000000000000000000000 --- a/vllm/config.py +++ /dev/null @@ -1,609 +0,0 @@ -from typing import Optional, Union, ClassVar -from dataclasses import dataclass -import os -from packaging.version import Version - -import torch -from transformers import PretrainedConfig - -from vllm.logger import init_logger -from vllm.transformers_utils.config import get_config -from vllm.utils import get_cpu_memory, is_hip, get_nvcc_cuda_version - -logger = init_logger(__name__) - -_GB = 1 << 30 - - -class ModelConfig: - """Configuration for the model. - - Args: - model: Name or path of the huggingface model to use. - tokenizer: Name or path of the huggingface tokenizer to use. - tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if - available, and "slow" will always use the slow tokenizer. - trust_remote_code: Trust remote code (e.g., from HuggingFace) when - downloading the model and tokenizer. - download_dir: Directory to download and load the weights, default to the - default cache directory of huggingface. - load_format: The format of the model weights to load: - "auto" will try to load the weights in the safetensors format and - fall back to the pytorch bin format if safetensors format is - not available. - "pt" will load the weights in the pytorch bin format. - "safetensors" will load the weights in the safetensors format. - "npcache" will load the weights in pytorch format and store - a numpy cache to speed up the loading. - "dummy" will initialize the weights with random values, which is - mainly for profiling. - dtype: Data type for model weights and activations. The "auto" option - will use FP16 precision for FP32 and FP16 models, and BF16 precision - for BF16 models. - seed: Random seed for reproducibility. - revision: The specific model version to use. It can be a branch name, - a tag name, or a commit id. If unspecified, will use the default - version. - tokenizer_revision: The specific tokenizer version to use. It can be a - branch name, a tag name, or a commit id. If unspecified, will use - the default version. - max_model_len: Maximum length of a sequence (including prompt and - output). If None, will be derived from the model. - quantization: Quantization method that was used to quantize the model - weights. If None, we assume the model weights are not quantized. - enforce_eager: Whether to enforce eager execution. If True, we will - disable CUDA graph and always execute the model in eager mode. - If False, we will use CUDA graph and eager execution in hybrid. - max_context_len_to_capture: Maximum context len covered by CUDA graphs. - When a sequence has context length larger than this, we fall back - to eager mode. 
- """ - - def __init__( - self, - model: str, - tokenizer: str, - tokenizer_mode: str, - trust_remote_code: bool, - download_dir: Optional[str], - load_format: str, - dtype: Union[str, torch.dtype], - seed: int, - revision: Optional[str] = None, - tokenizer_revision: Optional[str] = None, - max_model_len: Optional[int] = None, - quantization: Optional[str] = None, - enforce_eager: bool = False, - max_context_len_to_capture: Optional[int] = None, - ) -> None: - self.model = model - self.tokenizer = tokenizer - self.tokenizer_mode = tokenizer_mode - self.trust_remote_code = trust_remote_code - self.download_dir = download_dir - self.load_format = load_format - self.seed = seed - self.revision = revision - self.tokenizer_revision = tokenizer_revision - self.quantization = quantization - self.enforce_eager = enforce_eager - self.max_context_len_to_capture = max_context_len_to_capture - - if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true": - # download model from ModelScope hub, - # lazy import so that modelscope is not required for normal use. - from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C - model_path = snapshot_download(model_id=model, - cache_dir=download_dir, - revision=revision) - self.model = model_path - self.download_dir = model_path - self.tokenizer = model_path - - self.hf_config = get_config(self.model, trust_remote_code, revision) - self.dtype = _get_and_verify_dtype(self.hf_config, dtype) - self.max_model_len = _get_and_verify_max_len(self.hf_config, - max_model_len) - self._verify_load_format() - self._verify_tokenizer_mode() - self._verify_quantization() - self._verify_cuda_graph() - - def _verify_load_format(self) -> None: - load_format = self.load_format.lower() - supported_load_format = [ - "auto", "pt", "safetensors", "npcache", "dummy" - ] - rocm_not_supported_load_format = [] - if load_format not in supported_load_format: - raise ValueError( - f"Unknown load format: {self.load_format}. Must be one of " - "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.") - if is_hip() and load_format in rocm_not_supported_load_format: - rocm_supported_load_format = [ - f for f in supported_load_format - if (f not in rocm_not_supported_load_format) - ] - raise ValueError( - f"load format \'{load_format}\' is not supported in ROCm. " - f"Supported load format are " - f"{rocm_supported_load_format}") - - # TODO: Remove this check once HF updates the pt weights of Mixtral. - architectures = getattr(self.hf_config, "architectures", []) - if "MixtralForCausalLM" in architectures and load_format == "pt": - raise ValueError( - "Currently, the 'pt' format is not supported for Mixtral. " - "Please use the 'safetensors' format instead. ") - self.load_format = load_format - - def _verify_tokenizer_mode(self) -> None: - tokenizer_mode = self.tokenizer_mode.lower() - if tokenizer_mode not in ["auto", "slow"]: - raise ValueError( - f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " - "either 'auto' or 'slow'.") - self.tokenizer_mode = tokenizer_mode - - def _verify_quantization(self) -> None: - supported_quantization = ["awq", "gptq", "squeezellm"] - rocm_not_supported_quantization = ["awq"] - if self.quantization is not None: - self.quantization = self.quantization.lower() - - # Parse quantization method from the HF model config, if available. 
- hf_quant_config = getattr(self.hf_config, "quantization_config", None) - if hf_quant_config is not None: - hf_quant_method = str(hf_quant_config["quant_method"]).lower() - if self.quantization is None: - self.quantization = hf_quant_method - elif self.quantization != hf_quant_method: - raise ValueError( - "Quantization method specified in the model config " - f"({hf_quant_method}) does not match the quantization " - f"method specified in the `quantization` argument " - f"({self.quantization}).") - - if self.quantization is not None: - if self.quantization not in supported_quantization: - raise ValueError( - f"Unknown quantization method: {self.quantization}. Must " - f"be one of {supported_quantization}.") - if is_hip( - ) and self.quantization in rocm_not_supported_quantization: - raise ValueError( - f"{self.quantization} quantization is currently not supported " - f"in ROCm.") - logger.warning(f"{self.quantization} quantization is not fully " - "optimized yet. The speed can be slower than " - "non-quantized models.") - - def _verify_cuda_graph(self) -> None: - if self.max_context_len_to_capture is None: - self.max_context_len_to_capture = self.max_model_len - self.max_context_len_to_capture = min(self.max_context_len_to_capture, - self.max_model_len) - - def verify_with_parallel_config( - self, - parallel_config: "ParallelConfig", - ) -> None: - total_num_attention_heads = self.hf_config.num_attention_heads - tensor_parallel_size = parallel_config.tensor_parallel_size - if total_num_attention_heads % tensor_parallel_size != 0: - raise ValueError( - f"Total number of attention heads ({total_num_attention_heads})" - " must be divisible by tensor parallel size " - f"({tensor_parallel_size}).") - - total_num_hidden_layers = self.hf_config.num_hidden_layers - pipeline_parallel_size = parallel_config.pipeline_parallel_size - if total_num_hidden_layers % pipeline_parallel_size != 0: - raise ValueError( - f"Total number of hidden layers ({total_num_hidden_layers}) " - "must be divisible by pipeline parallel size " - f"({pipeline_parallel_size}).") - - def get_sliding_window(self) -> Optional[int]: - return getattr(self.hf_config, "sliding_window", None) - - def get_vocab_size(self) -> int: - return self.hf_config.vocab_size - - def get_hidden_size(self) -> int: - return self.hf_config.hidden_size - - def get_head_size(self) -> int: - if hasattr(self.hf_config, "head_dim"): - return self.hf_config.head_dim - # FIXME(woosuk): This may not be true for all models. - return self.hf_config.hidden_size // self.hf_config.num_attention_heads - - def get_total_num_kv_heads(self) -> int: - """Returns the total number of KV heads.""" - # For GPTBigCode & Falcon: - # NOTE: for falcon, when new_decoder_architecture is True, the - # multi_query flag is ignored and we use n_head_kv for the number of - # KV heads. - falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] - new_decoder_arch_falcon = ( - self.hf_config.model_type in falcon_model_types - and getattr(self.hf_config, "new_decoder_architecture", False)) - if not new_decoder_arch_falcon and getattr(self.hf_config, - "multi_query", False): - # Multi-query attention, only one KV head. - # Currently, tensor parallelism is not supported in this case. 
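# NOTE (illustrative): e.g. GPTBigCode models and classic multi-query
# Falcon variants take this branch and report a single KV head,
# regardless of num_attention_heads.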
- return 1 - - attributes = [ - # For Falcon: - "n_head_kv", - "num_kv_heads", - # For LLaMA-2: - "num_key_value_heads", - # For ChatGLM: - "multi_query_group_num", - ] - for attr in attributes: - num_kv_heads = getattr(self.hf_config, attr, None) - if num_kv_heads is not None: - return num_kv_heads - - # For non-grouped-query attention models, the number of KV heads is - # equal to the number of attention heads. - return self.hf_config.num_attention_heads - - def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: - """Returns the number of KV heads per GPU.""" - total_num_kv_heads = self.get_total_num_kv_heads() - # If tensor parallelism is used, we divide the number of KV heads by - # the tensor parallel size. We will replicate the KV heads in the - # case where the number of KV heads is smaller than the tensor - # parallel size so each GPU has at least one KV head. - return max(1, - total_num_kv_heads // parallel_config.tensor_parallel_size) - - def get_num_layers(self, parallel_config: "ParallelConfig") -> int: - total_num_hidden_layers = self.hf_config.num_hidden_layers - return total_num_hidden_layers // parallel_config.pipeline_parallel_size - - -class CacheConfig: - """Configuration for the KV cache. - - Args: - block_size: Size of a cache block in number of tokens. - gpu_memory_utilization: Fraction of GPU memory to use for the - vLLM execution. - swap_space: Size of the CPU swap space per GPU (in GiB). - cache_dtype: Data type for kv cache storage. - """ - - def __init__( - self, - block_size: int, - gpu_memory_utilization: float, - swap_space: int, - cache_dtype: str, - sliding_window: Optional[int] = None, - ) -> None: - self.block_size = block_size - self.gpu_memory_utilization = gpu_memory_utilization - self.swap_space_bytes = swap_space * _GB - self.cache_dtype = cache_dtype - self.sliding_window = sliding_window - self._verify_args() - self._verify_cache_dtype() - - # Will be set after profiling. - self.num_gpu_blocks = None - self.num_cpu_blocks = None - - def _verify_args(self) -> None: - if self.gpu_memory_utilization > 1.0: - raise ValueError( - "GPU memory utilization must be less than 1.0. Got " - f"{self.gpu_memory_utilization}.") - - def _verify_cache_dtype(self) -> None: - if self.cache_dtype == "auto": - pass - elif self.cache_dtype == "fp8_e5m2": - nvcc_cuda_version = get_nvcc_cuda_version() - if nvcc_cuda_version < Version("11.8"): - raise ValueError( - "FP8 is not supported when cuda version is lower than 11.8." - ) - device_name = torch.cuda.get_device_name() - if "AMD" in device_name: - raise NotImplementedError( - "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.") - logger.info( - "Using fp8_e5m2 data type to store kv cache. It reduces " - "the GPU memory footprint and boosts the performance. " - "But it may cause slight accuracy drop. " - "Currently we only support fp8 without scaling factors and " - "make e5m2 as a default format.") - else: - raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") - - def verify_with_parallel_config( - self, - parallel_config: "ParallelConfig", - ) -> None: - total_cpu_memory = get_cpu_memory() - # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel - # group are in the same node. However, the GPUs may span multiple nodes. 
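# NOTE (worked example, hypothetical values): with the default swap_space
# of 4 GiB per GPU and tensor_parallel_size=4, cpu_memory_usage below is
# 16 GiB, which is compared against 40% (warning) and 70% (hard error) of
# the host's total CPU memory.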
- num_gpus_per_node = parallel_config.tensor_parallel_size - cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node - - msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of " - f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " - "allocated for the swap space.") - if cpu_memory_usage > 0.7 * total_cpu_memory: - raise ValueError("Too large swap space. " + msg) - elif cpu_memory_usage > 0.4 * total_cpu_memory: - logger.warning("Possibly too large swap space. " + msg) - - -class ParallelConfig: - """Configuration for the distributed execution. - - Args: - pipeline_parallel_size: Number of pipeline parallel groups. - tensor_parallel_size: Number of tensor parallel groups. - worker_use_ray: Whether to use Ray for model workers. Will be set to - True if either pipeline_parallel_size or tensor_parallel_size is - greater than 1. - disable_custom_all_reduce: Disable the custom all-reduce kernel and - fall back to NCCL. - """ - - def __init__( - self, - pipeline_parallel_size: int, - tensor_parallel_size: int, - worker_use_ray: bool, - max_parallel_loading_workers: Optional[int] = None, - disable_custom_all_reduce: bool = False, - ) -> None: - self.pipeline_parallel_size = pipeline_parallel_size - self.tensor_parallel_size = tensor_parallel_size - self.worker_use_ray = worker_use_ray - self.max_parallel_loading_workers = max_parallel_loading_workers - self.disable_custom_all_reduce = disable_custom_all_reduce - - self.world_size = pipeline_parallel_size * tensor_parallel_size - if self.world_size > 1: - self.worker_use_ray = True - self._verify_args() - - def _verify_args(self) -> None: - if self.pipeline_parallel_size > 1: - raise NotImplementedError( - "Pipeline parallelism is not supported yet.") - if is_hip(): - self.disable_custom_all_reduce = True - logger.info( - "Disabled the custom all-reduce kernel because it is not " - "supported on AMD GPUs.") - elif self.pipeline_parallel_size > 1: - self.disable_custom_all_reduce = True - logger.info( - "Disabled the custom all-reduce kernel because it is not " - "supported with pipeline parallelism.") - - -class SchedulerConfig: - """Scheduler configuration. - - Args: - max_num_batched_tokens: Maximum number of tokens to be processed in - a single iteration. - max_num_seqs: Maximum number of sequences to be processed in a single - iteration. - max_model_len: Maximum length of a sequence (including prompt - and generated text). - max_paddings: Maximum number of paddings to be added to a batch. - """ - - def __init__( - self, - max_num_batched_tokens: Optional[int], - max_num_seqs: int, - max_model_len: int, - max_paddings: int, - ) -> None: - if max_num_batched_tokens is not None: - self.max_num_batched_tokens = max_num_batched_tokens - else: - # If max_model_len is too short, use 2048 as the default value for - # higher throughput. - self.max_num_batched_tokens = max(max_model_len, 2048) - self.max_num_seqs = max_num_seqs - self.max_model_len = max_model_len - self.max_paddings = max_paddings - self._verify_args() - - def _verify_args(self) -> None: - if self.max_num_batched_tokens < self.max_model_len: - raise ValueError( - f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " - f"smaller than max_model_len ({self.max_model_len}). " - "This effectively limits the maximum sequence length to " - "max_num_batched_tokens and makes vLLM reject longer " - "sequences. 
Please increase max_num_batched_tokens or " - "decrease max_model_len.") - if self.max_num_batched_tokens < self.max_num_seqs: - raise ValueError( - f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " - "be greater than or equal to max_num_seqs " - f"({self.max_num_seqs}).") - - -@dataclass -class LoRAConfig: - max_lora_rank: int - max_loras: int - max_cpu_loras: Optional[int] = None - lora_dtype: Optional[torch.dtype] = None - lora_extra_vocab_size: int = 256 - # This is a constant. - lora_vocab_padding_size: ClassVar[int] = 256 - - def __post_init__(self): - # Keep this in sync with csrc/punica/bgmv/bgmv_config.h - possible_max_ranks = (8, 16, 32, 64) - possible_lora_extra_vocab_size = (0, 256, 512) - if self.max_lora_rank not in possible_max_ranks: - raise ValueError( - f"max_lora_rank ({self.max_lora_rank}) must be one of " - f"{possible_max_ranks}.") - if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size: - raise ValueError( - f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) " - f"must be one of {possible_lora_extra_vocab_size}.") - if self.max_loras < 1: - raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.") - if self.max_cpu_loras is None: - self.max_cpu_loras = self.max_loras - elif self.max_cpu_loras < self.max_loras: - raise ValueError( - f"max_cpu_loras ({self.max_cpu_loras}) must be >= " - f"max_num_seqs ({self.max_loras})") - - def verify_with_model_config(self, model_config: ModelConfig): - if self.lora_dtype in (None, "auto"): - self.lora_dtype = model_config.dtype - elif isinstance(self.lora_dtype, str): - self.lora_dtype = getattr(torch, self.lora_dtype) - if model_config.quantization is not None: - raise ValueError( - "LoRA is not supported with quantized models yet.") - - def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): - if scheduler_config.max_num_batched_tokens > 65528: - raise ValueError( - "Due to limitations of the custom LoRA CUDA kernel, " - "max_num_batched_tokens must be <= 65528 when " - "LoRA is enabled.") - - -_STR_DTYPE_TO_TORCH_DTYPE = { - "half": torch.float16, - "float16": torch.float16, - "float": torch.float32, - "float32": torch.float32, - "bfloat16": torch.bfloat16, -} - -_ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"] - - -def _get_and_verify_dtype( - config: PretrainedConfig, - dtype: Union[str, torch.dtype], -) -> torch.dtype: - # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct - # because config.torch_dtype can be None. - config_dtype = getattr(config, "torch_dtype", None) - if config_dtype is None: - config_dtype = torch.float32 - - if isinstance(dtype, str): - dtype = dtype.lower() - if dtype == "auto": - if config_dtype == torch.float32: - # Following the common practice, we use float16 for float32 - # models. - torch_dtype = torch.float16 - else: - torch_dtype = config_dtype - else: - if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: - raise ValueError(f"Unknown dtype: {dtype}") - torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] - elif isinstance(dtype, torch.dtype): - torch_dtype = dtype - else: - raise ValueError(f"Unknown dtype: {dtype}") - - if is_hip() and torch_dtype == torch.float32: - rocm_supported_dtypes = [ - k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() - if (k not in _ROCM_NOT_SUPPORTED_DTYPE) - ] - raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. " - f"Supported dtypes are {rocm_supported_dtypes}") - - # Verify the dtype. - if torch_dtype != config_dtype: - if torch_dtype == torch.float32: - # Upcasting to float32 is allowed. 
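# NOTE (illustrative): e.g. requesting --dtype float32 for a float16
# checkpoint takes this branch and is accepted silently; only a
# float16 <-> bfloat16 mismatch reaches the warning further below.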
- pass - elif config_dtype == torch.float32: - # Downcasting from float32 to float16 or bfloat16 is allowed. - pass - else: - # Casting between float16 and bfloat16 is allowed with a warning. - logger.warning(f"Casting {config_dtype} to {torch_dtype}.") - - return torch_dtype - - -def _get_and_verify_max_len( - hf_config: PretrainedConfig, - max_model_len: Optional[int], -) -> int: - """Get and verify the model's maximum length.""" - derived_max_model_len = float("inf") - possible_keys = [ - # OPT - "max_position_embeddings", - # GPT-2 - "n_positions", - # MPT - "max_seq_len", - # ChatGLM2 - "seq_length", - # Others - "max_sequence_length", - "max_seq_length", - "seq_len", - ] - for key in possible_keys: - max_len_key = getattr(hf_config, key, None) - if max_len_key is not None: - derived_max_model_len = min(derived_max_model_len, max_len_key) - if derived_max_model_len == float("inf"): - if max_model_len is not None: - # If max_model_len is specified, we use it. - return max_model_len - - default_max_len = 2048 - logger.warning( - "The model's config.json does not contain any of the following " - "keys to determine the original maximum length of the model: " - f"{possible_keys}. Assuming the model's maximum length is " - f"{default_max_len}.") - derived_max_model_len = default_max_len - - rope_scaling = getattr(hf_config, "rope_scaling", None) - if rope_scaling is not None: - assert "factor" in rope_scaling - scaling_factor = rope_scaling["factor"] - if rope_scaling["type"] == "yarn": - derived_max_model_len = rope_scaling[ - "original_max_position_embeddings"] - derived_max_model_len *= scaling_factor - - if max_model_len is None: - max_model_len = derived_max_model_len - elif max_model_len > derived_max_model_len: - raise ValueError( - f"User-specified max_model_len ({max_model_len}) is greater than " - f"the derived max_model_len ({max_len_key}={derived_max_model_len}" - " in model's config.json). This may lead to incorrect model " - "outputs or CUDA errors. Make sure the value is correct and " - "within the model context size.") - return int(max_model_len) diff --git a/vllm/core/__init__.py b/vllm/core/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py deleted file mode 100644 index 7f91051f03ac1711e83be21b164d7cc4814302ce..0000000000000000000000000000000000000000 --- a/vllm/core/block_manager.py +++ /dev/null @@ -1,330 +0,0 @@ -"""A block manager that manages token blocks.""" -import enum -from typing import Dict, List, Optional, Set, Tuple - -from vllm.block import BlockTable, PhysicalTokenBlock -from vllm.sequence import Sequence, SequenceGroup, SequenceStatus -from vllm.utils import Device - - -class BlockAllocator: - """Manages free physical token blocks for a device. - - The allocator maintains a list of free blocks and allocates a block when - requested. When a block is freed, its reference count is decremented. If - the reference count becomes zero, the block is added back to the free list. - """ - - def __init__( - self, - device: Device, - block_size: int, - num_blocks: int, - ) -> None: - self.device = device - self.block_size = block_size - self.num_blocks = num_blocks - - # Initialize the free blocks. 
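# NOTE (illustrative): with num_blocks=4 the free list becomes
# [block 0, block 1, block 2, block 3]; allocate() pops from the tail,
# so block 3 is handed out first and freed blocks are reused in LIFO order.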
- self.free_blocks: BlockTable = [] - for i in range(num_blocks): - block = PhysicalTokenBlock(device=device, - block_number=i, - block_size=block_size) - self.free_blocks.append(block) - - def allocate(self) -> PhysicalTokenBlock: - if not self.free_blocks: - raise ValueError("Out of memory! No free blocks are available.") - block = self.free_blocks.pop() - block.ref_count = 1 - return block - - def free(self, block: PhysicalTokenBlock) -> None: - if block.ref_count == 0: - raise ValueError(f"Double free! {block} is already freed.") - block.ref_count -= 1 - if block.ref_count == 0: - self.free_blocks.append(block) - - def get_num_free_blocks(self) -> int: - return len(self.free_blocks) - - -class AllocStatus(enum.Enum): - """Result for BlockSpaceManager.can_allocate - - 1. Ok: seq_group can be allocated now. - 2. Later: seq_group cannot be allocated. - The capacity of allocator is larger than seq_group required. - 3. Never: seq_group can never be allocated. - The seq_group is too large to allocated in GPU. - """ - OK = enum.auto() - LATER = enum.auto() - NEVER = enum.auto() - - -class BlockSpaceManager: - """Manages the mapping between logical and physical token blocks.""" - - def __init__( - self, - block_size: int, - num_gpu_blocks: int, - num_cpu_blocks: int, - watermark: float = 0.01, - sliding_window: Optional[int] = None, - ) -> None: - self.block_size = block_size - self.num_total_gpu_blocks = num_gpu_blocks - self.num_total_cpu_blocks = num_cpu_blocks - - self.block_sliding_window = None - if sliding_window is not None: - assert sliding_window % block_size == 0, (sliding_window, - block_size) - self.block_sliding_window = sliding_window // block_size - - self.watermark = watermark - assert watermark >= 0.0 - - self.watermark_blocks = int(watermark * num_gpu_blocks) - self.gpu_allocator = BlockAllocator(Device.GPU, block_size, - num_gpu_blocks) - self.cpu_allocator = BlockAllocator(Device.CPU, block_size, - num_cpu_blocks) - # Mapping: seq_id -> BlockTable. - self.block_tables: Dict[int, BlockTable] = {} - - def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: - # FIXME(woosuk): Here we assume that all sequences in the group share - # the same prompt. This may not be true for preempted sequences. - seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - num_required_blocks = len(seq.logical_token_blocks) - - if seq_group.prefix is not None and seq_group.prefix.allocated: - num_required_blocks -= seq_group.prefix.get_num_blocks() - - if self.block_sliding_window is not None: - num_required_blocks = min(num_required_blocks, - self.block_sliding_window) - num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks() - - # Use watermark to avoid frequent cache eviction. - if (self.num_total_gpu_blocks - num_required_blocks < - self.watermark_blocks): - return AllocStatus.NEVER - if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks: - return AllocStatus.OK - else: - return AllocStatus.LATER - - def allocate(self, seq_group: SequenceGroup) -> None: - # NOTE: Here we assume that all sequences in the group have the same - # prompt. - seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - - # Allocate new physical token blocks that will store the prompt tokens. - num_prompt_blocks = len(seq.logical_token_blocks) - - block_table: BlockTable = [] - prefix_block_table: BlockTable = [] - num_prefix_blocks = 0 - - prefix = seq_group.prefix - if prefix is not None and prefix.allocated: - # Prefix has already been allocated. Use the existing block table. 
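# NOTE: the shared prefix blocks get their ref_count bumped once per
# sequence in the group, so freeing any single sequence later cannot
# release a block that its siblings (or the cached prefix) still need.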
- num_prompt_blocks -= prefix.get_num_blocks() - for block in prefix.block_table: - block.ref_count += seq_group.num_seqs() - block_table.append(block) - - for logical_idx in range(num_prompt_blocks): - if (self.block_sliding_window is not None - and logical_idx >= self.block_sliding_window): - block = block_table[logical_idx % self.block_sliding_window] - else: - block = self.gpu_allocator.allocate() - # Set the reference counts of the token blocks. - block.ref_count = seq_group.num_seqs() - block_table.append(block) - - if prefix is not None and not prefix.allocated: - # Allocate blocks for the prefix, we will compute the prefix's - # KV cache in this run. - num_prefix_blocks = prefix.get_num_blocks() - prefix_block_table = block_table[:num_prefix_blocks] - for block in prefix_block_table: - block.ref_count += 1 - prefix.set_block_table(prefix_block_table) - - # Assign the block table for each sequence. - for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): - self.block_tables[seq.seq_id] = block_table.copy() - - def can_append_slot(self, seq_group: SequenceGroup) -> bool: - # Simple heuristic: If there is at least one free block - # for each sequence, we can append. - num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks() - num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) - return num_seqs <= num_free_gpu_blocks - - def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: - """Allocate a physical slot for a new token.""" - logical_blocks = seq.logical_token_blocks - block_table = self.block_tables[seq.seq_id] - - if len(block_table) < len(logical_blocks): - if (self.block_sliding_window - and len(block_table) >= self.block_sliding_window): - # re-use a block - block_table.append(block_table[len(block_table) % - self.block_sliding_window]) - else: - # The sequence has a new logical block. - # Allocate a new physical block. - block = self.gpu_allocator.allocate() - block_table.append(block) - return None - - # We want to append the token to the last physical block. - last_block = block_table[-1] - assert last_block.device == Device.GPU - if last_block.ref_count == 1: - # Not shared with other sequences. Appendable. - return None - else: - # The last block is shared with other sequences. - # Copy on Write: Allocate a new block and copy the tokens. - new_block = self.gpu_allocator.allocate() - block_table[-1] = new_block - self.gpu_allocator.free(last_block) - return last_block.block_number, new_block.block_number - - def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: - # NOTE: fork does not allocate a new physical block. - # Thus, it is always safe from OOM. - src_block_table = self.block_tables[parent_seq.seq_id] - self.block_tables[child_seq.seq_id] = src_block_table.copy() - for block in src_block_table: - block.ref_count += 1 - - def _get_physical_blocks( - self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]: - # NOTE: Here, we assume that the physical blocks are only shared by - # the sequences in the same group. 
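# NOTE: a set is used so that blocks shared by forked sequences in the
# group (e.g. beam-search candidates) are counted only once when sizing
# the swap-in / swap-out operations below.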
- blocks: Set[PhysicalTokenBlock] = set() - for seq in seq_group.get_seqs(): - if seq.is_finished(): - continue - blocks.update(self.block_tables[seq.seq_id]) - return list(blocks) - - def can_swap_in(self, seq_group: SequenceGroup) -> bool: - blocks = self._get_physical_blocks(seq_group) - num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED) - num_free_blocks = self.gpu_allocator.get_num_free_blocks() - # NOTE: Conservatively, we assume that every sequence will allocate - # at least one free block right after the swap-in. - # NOTE: This should match the logic in can_append_slot(). - num_required_blocks = len(blocks) + num_swapped_seqs - return num_free_blocks - num_required_blocks >= self.watermark_blocks - - def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: - # CPU block -> GPU block. - if seq_group.prefix is not None: - # make sure to swap in the prefix first - assert seq_group.prefix.allocated and seq_group.prefix.computed - - mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - new_block_table: BlockTable = [] - block_table = self.block_tables[seq.seq_id] - if seq_group.prefix is not None: - for block in seq_group.prefix.block_table: - new_block_table.append(block) - block.ref_count += 1 - - for cpu_block in block_table: - if cpu_block in mapping: - gpu_block = mapping[cpu_block] - gpu_block.ref_count += 1 - else: - gpu_block = self.gpu_allocator.allocate() - mapping[cpu_block] = gpu_block - new_block_table.append(gpu_block) - # Free the CPU block swapped in to GPU. - self.cpu_allocator.free(cpu_block) - self.block_tables[seq.seq_id] = new_block_table - - block_number_mapping = { - cpu_block.block_number: gpu_block.block_number - for cpu_block, gpu_block in mapping.items() - } - return block_number_mapping - - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - blocks = self._get_physical_blocks(seq_group) - return len(blocks) <= self.cpu_allocator.get_num_free_blocks() - - def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: - # GPU block -> CPU block. - mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - new_block_table: BlockTable = [] - block_table = self.block_tables[seq.seq_id] - - for gpu_block in block_table: - if (seq_group.prefix is not None - and gpu_block in seq_group.prefix.block_table): - # NOTE: We do not swap out the prefix blocks for now. - self.gpu_allocator.free(gpu_block) - continue - - if gpu_block in mapping: - cpu_block = mapping[gpu_block] - cpu_block.ref_count += 1 - else: - cpu_block = self.cpu_allocator.allocate() - mapping[gpu_block] = cpu_block - new_block_table.append(cpu_block) - # Free the GPU block swapped out to CPU. - self.gpu_allocator.free(gpu_block) - self.block_tables[seq.seq_id] = new_block_table - - block_number_mapping = { - gpu_block.block_number: cpu_block.block_number - for gpu_block, cpu_block in mapping.items() - } - return block_number_mapping - - def _free_block_table(self, block_table: BlockTable) -> None: - for block in set(block_table): - if block.device == Device.GPU: - self.gpu_allocator.free(block) - else: - self.cpu_allocator.free(block) - - def free(self, seq: Sequence) -> None: - if seq.seq_id not in self.block_tables: - # Already freed or haven't been scheduled yet. 
- return - block_table = self.block_tables[seq.seq_id] - self._free_block_table(block_table) - del self.block_tables[seq.seq_id] - - def reset(self) -> None: - for block_table in self.block_tables.values(): - self._free_block_table(block_table) - self.block_tables.clear() - - def get_block_table(self, seq: Sequence) -> List[int]: - block_table = self.block_tables[seq.seq_id] - return [block.block_number for block in block_table] - - def get_num_free_gpu_blocks(self) -> int: - return self.gpu_allocator.get_num_free_blocks() - - def get_num_free_cpu_blocks(self) -> int: - return self.cpu_allocator.get_num_free_blocks() diff --git a/vllm/core/policy.py b/vllm/core/policy.py deleted file mode 100644 index 99f183b42c8b49603f783c42ed4d664c10e1ce63..0000000000000000000000000000000000000000 --- a/vllm/core/policy.py +++ /dev/null @@ -1,47 +0,0 @@ -from collections import deque -from typing import Deque - -from vllm.sequence import SequenceGroup - - -class Policy: - - def get_priority( - self, - now: float, - seq_group: SequenceGroup, - ) -> float: - raise NotImplementedError - - def sort_by_priority( - self, - now: float, - seq_groups: Deque[SequenceGroup], - ) -> Deque[SequenceGroup]: - return deque( - sorted( - seq_groups, - key=lambda seq_group: self.get_priority(now, seq_group), - reverse=True, - )) - - -class FCFS(Policy): - - def get_priority( - self, - now: float, - seq_group: SequenceGroup, - ) -> float: - return now - seq_group.arrival_time - - -class PolicyFactory: - - _POLICY_REGISTRY = { - 'fcfs': FCFS, - } - - @classmethod - def get_policy(cls, policy_name: str, **kwargs) -> Policy: - return cls._POLICY_REGISTRY[policy_name](**kwargs) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py deleted file mode 100644 index 4fdf9ec341cfd2dc95c0b5080b1971859806b686..0000000000000000000000000000000000000000 --- a/vllm/core/scheduler.py +++ /dev/null @@ -1,494 +0,0 @@ -from collections import deque -import enum -import time -from typing import Deque, Dict, Iterable, List, Optional, Tuple, Union, Set - -from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig -from vllm.core.block_manager import AllocStatus, BlockSpaceManager -from vllm.core.policy import PolicyFactory -from vllm.lora.request import LoRARequest -from vllm.logger import init_logger -from vllm.sequence import (Sequence, SequenceData, SequenceGroup, - SequenceGroupMetadata, SequenceStatus) -from vllm.prefix import PrefixPool - -logger = init_logger(__name__) - - -class PreemptionMode(enum.Enum): - """Preemption modes. - - 1. Swapping: Swap out the blocks of the preempted sequences to CPU memory - and swap them back in when the sequences are resumed. - 2. Recomputation: Discard the blocks of the preempted sequences and - recompute them when the sequences are resumed, treating the sequences as - new prompts. - """ - SWAP = enum.auto() - RECOMPUTE = enum.auto() - - -class SchedulerOutputs: - - def __init__( - self, - scheduled_seq_groups: Iterable[SequenceGroup], - prompt_run: bool, - num_batched_tokens: int, - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ignored_seq_groups: List[SequenceGroup], - ) -> None: - self.scheduled_seq_groups = scheduled_seq_groups - self.prompt_run = prompt_run - self.num_batched_tokens = num_batched_tokens - self.blocks_to_swap_in = blocks_to_swap_in - self.blocks_to_swap_out = blocks_to_swap_out - self.blocks_to_copy = blocks_to_copy - # Swap in and swap out should never happen at the same time. 
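# NOTE: a single scheduling step either admits previously swapped-out
# groups back onto the GPU or evicts running groups to CPU, never both;
# the assertion below encodes that invariant.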
- assert not (blocks_to_swap_in and blocks_to_swap_out) - self.ignored_seq_groups = ignored_seq_groups - - self.num_loras = len(self.lora_requests) - if self.num_loras > 0: - self._sort_by_lora_ids() - - def is_empty(self) -> bool: - # NOTE: We do not consider the ignored sequence groups. - return (not self.scheduled_seq_groups and not self.blocks_to_swap_in - and not self.blocks_to_swap_out and not self.blocks_to_copy) - - def _sort_by_lora_ids(self) -> bool: - self.scheduled_seq_groups = sorted( - self.scheduled_seq_groups, - key=lambda g: (g.lora_request.lora_int_id - if g.lora_request else 0, g.request_id)) - - @property - def lora_requests(self) -> Set[LoRARequest]: - return {g.lora_request for g in self.scheduled_seq_groups} - - -class Scheduler: - - def __init__( - self, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig, - lora_config: Optional[LoRAConfig], - ) -> None: - self.scheduler_config = scheduler_config - self.cache_config = cache_config - # Note for LoRA scheduling: the current policy is extremely - # simple and NOT fair. It can lead to starvation of some - # LoRAs. This should be improved in the future. - self.lora_config = lora_config - - self.prompt_limit = min(self.scheduler_config.max_model_len, - self.scheduler_config.max_num_batched_tokens) - - # Instantiate the scheduling policy. - self.policy = PolicyFactory.get_policy(policy_name="fcfs") - # Create the block space manager. - self.block_manager = BlockSpaceManager( - block_size=self.cache_config.block_size, - num_gpu_blocks=self.cache_config.num_gpu_blocks, - num_cpu_blocks=self.cache_config.num_cpu_blocks, - sliding_window=self.cache_config.sliding_window) - - # Create the prefix pool to cache the prefixes. - self.prefix_pool = PrefixPool(self.cache_config.block_size) - - # Sequence groups in the WAITING state. - self.waiting: Deque[SequenceGroup] = deque() - # Sequence groups in the RUNNING state. - self.running: Deque[SequenceGroup] = deque() - # Sequence groups in the SWAPPED state. - self.swapped: Deque[SequenceGroup] = deque() - - @property - def lora_enabled(self) -> bool: - return bool(self.lora_config) - - def add_seq_group(self, seq_group: SequenceGroup) -> None: - # Add sequence groups to the waiting queue. - self.waiting.append(seq_group) - - def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None: - """Aborts a sequence group with the given ID. - - Check if the sequence group with the given ID - is present in any of the state queue. - If present, remove the sequence group from the state queue. - Also, if any of the sequences in the sequence group is not finished, - free the sequence with status `FINISHED_ABORTED`. - Otherwise, do nothing. - - Args: - request_id: The ID(s) of the sequence group to abort. - """ - if isinstance(request_id, str): - request_id = (request_id, ) - request_ids = set(request_id) - for state_queue in [self.waiting, self.running, self.swapped]: - aborted_groups: List[SequenceGroup] = [] - for seq_group in state_queue: - if not request_ids: - # Using 'break' here may add two extra iterations, - # but is acceptable to reduce complexity . - break - if seq_group.request_id in request_ids: - # Appending aborted group into pending list. - aborted_groups.append(seq_group) - request_ids.remove(seq_group.request_id) - for aborted_group in aborted_groups: - # Remove the sequence group from the state queue. 
- state_queue.remove(aborted_group) - for seq in aborted_group.get_seqs(): - if seq.is_finished(): - continue - seq.status = SequenceStatus.FINISHED_ABORTED - self.free_seq(seq) - - def has_unfinished_seqs(self) -> bool: - return self.waiting or self.running or self.swapped - - def get_num_unfinished_seq_groups(self) -> int: - return len(self.waiting) + len(self.running) + len(self.swapped) - - def _schedule(self) -> SchedulerOutputs: - # Blocks that need to be swaped or copied before model execution. - blocks_to_swap_in: Dict[int, int] = {} - blocks_to_swap_out: Dict[int, int] = {} - blocks_to_copy: Dict[int, List[int]] = {} - - # Fix the current time. - now = time.monotonic() - - # Join waiting sequences if possible. - if not self.swapped: - ignored_seq_groups: List[SequenceGroup] = [] - scheduled: List[SequenceGroup] = [] - # The total number of sequences on the fly, including the - # requests in the generation phase. - num_curr_seqs = sum(seq_group.get_max_num_running_seqs() - for seq_group in self.running) - curr_loras = set( - seq_group.lora_int_id - for seq_group in self.running) if self.lora_enabled else None - seq_lens: List[int] = [] - - # Optimization: We do not sort the waiting queue since the preempted - # sequence groups are added to the front and the new sequence groups - # are added to the back. - leftover_waiting_sequences = deque() - while self.waiting: - seq_group = self.waiting[0] - waiting_seqs = seq_group.get_seqs( - status=SequenceStatus.WAITING) - assert len(waiting_seqs) == 1, ( - "Waiting sequence group should have only one prompt " - "sequence.") - num_prompt_tokens = waiting_seqs[0].get_len() - if num_prompt_tokens > self.prompt_limit: - logger.warning( - f"Input prompt ({num_prompt_tokens} tokens) is too long" - f" and exceeds limit of {self.prompt_limit}") - for seq in waiting_seqs: - seq.status = SequenceStatus.FINISHED_IGNORED - ignored_seq_groups.append(seq_group) - self.waiting.popleft() - continue - - # If the sequence group cannot be allocated, stop. - can_allocate = self.block_manager.can_allocate(seq_group) - if can_allocate == AllocStatus.LATER: - break - elif can_allocate == AllocStatus.NEVER: - logger.warning( - f"Input prompt ({num_prompt_tokens} tokens) is too long" - f" and exceeds the capacity of block_manager") - for seq in waiting_seqs: - seq.status = SequenceStatus.FINISHED_IGNORED - ignored_seq_groups.append(seq_group) - self.waiting.popleft() - continue - - lora_int_id = 0 - if self.lora_enabled: - lora_int_id = seq_group.lora_int_id - if lora_int_id > 0 and lora_int_id not in curr_loras and len( - curr_loras) >= self.lora_config.max_loras: - # We don't have a space for another LoRA, so - # we ignore this request for now. - leftover_waiting_sequences.appendleft(seq_group) - self.waiting.popleft() - continue - - # If the number of batched tokens exceeds the limit, stop. - new_seq_lens = seq_lens + [num_prompt_tokens] - num_batched_tokens = len(new_seq_lens) * max(new_seq_lens) - if (num_batched_tokens > - self.scheduler_config.max_num_batched_tokens): - break - - # The total number of sequences in the RUNNING state should not - # exceed the maximum number of sequences. 
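# NOTE (assumes the usual SequenceGroup semantics): get_max_num_running_seqs()
# is meant to account for best_of / beam-search fan-out, so a single waiting
# prompt may reserve more than one slot in this per-step sequence budget.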
- num_new_seqs = seq_group.get_max_num_running_seqs() - if (num_curr_seqs + num_new_seqs > - self.scheduler_config.max_num_seqs): - break - - num_paddings = num_batched_tokens - sum(new_seq_lens) - if num_paddings > self.scheduler_config.max_paddings: - break - seq_lens = new_seq_lens - - if lora_int_id > 0: - curr_loras.add(lora_int_id) - self.waiting.popleft() - self._allocate(seq_group) - self.running.append(seq_group) - num_curr_seqs += num_new_seqs - scheduled.append(seq_group) - - self.waiting.extendleft(leftover_waiting_sequences) - - if scheduled or ignored_seq_groups: - scheduler_outputs = SchedulerOutputs( - scheduled_seq_groups=scheduled, - prompt_run=True, - num_batched_tokens=len(seq_lens) * - max(seq_lens) if seq_lens else 0, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ignored_seq_groups=ignored_seq_groups, - ) - return scheduler_outputs - - # NOTE(woosuk): Preemption happens only when there is no available slot - # to keep all the sequence groups in the RUNNING state. - # In this case, the policy is responsible for deciding which sequence - # groups to preempt. - self.running = self.policy.sort_by_priority(now, self.running) - - # Reserve new token slots for the running sequence groups. - running: Deque[SequenceGroup] = deque() - preempted: List[SequenceGroup] = [] - while self.running: - seq_group = self.running.popleft() - while not self.block_manager.can_append_slot(seq_group): - if self.running: - # Preempt the lowest-priority sequence groups. - victim_seq_group = self.running.pop() - self._preempt(victim_seq_group, blocks_to_swap_out) - preempted.append(victim_seq_group) - else: - # No other sequence groups can be preempted. - # Preempt the current sequence group. - self._preempt(seq_group, blocks_to_swap_out) - preempted.append(seq_group) - break - else: - # Append new slots to the sequence group. - self._append_slot(seq_group, blocks_to_copy) - running.append(seq_group) - self.running = running - - # Swap in the sequence groups in the SWAPPED state if possible. - self.swapped = self.policy.sort_by_priority(now, self.swapped) - if not preempted: - num_curr_seqs = sum(seq_group.get_max_num_running_seqs() - for seq_group in self.running) - curr_loras = set( - seq_group.lora_int_id - for seq_group in self.running) if self.lora_enabled else None - - leftover_swapped = deque() - - while self.swapped: - seq_group = self.swapped[0] - lora_int_id = 0 - if self.lora_enabled: - lora_int_id = seq_group.lora_int_id - if lora_int_id > 0 and lora_int_id not in curr_loras and len( - curr_loras) >= self.lora_config.max_loras: - # We don't have a space for another LoRA, so - # we ignore this request for now. - leftover_swapped.appendleft(seq_group) - self.swapped.popleft() - continue - - # If the sequence group cannot be swapped in, stop. - if not self.block_manager.can_swap_in(seq_group): - break - - # The total number of sequences in the RUNNING state should not - # exceed the maximum number of sequences. - num_new_seqs = seq_group.get_max_num_running_seqs() - if (num_curr_seqs + num_new_seqs > - self.scheduler_config.max_num_seqs): - break - - if lora_int_id > 0: - curr_loras.add(lora_int_id) - self.swapped.popleft() - self._swap_in(seq_group, blocks_to_swap_in) - self._append_slot(seq_group, blocks_to_copy) - num_curr_seqs += num_new_seqs - self.running.append(seq_group) - - self.swapped.extendleft(leftover_swapped) - - # Each sequence in the generation phase only takes one token slot. 
- # Therefore, the number of batched tokens is equal to the number of - # sequences in the RUNNING state. - num_batched_tokens = sum( - seq_group.num_seqs(status=SequenceStatus.RUNNING) - for seq_group in self.running) - - scheduler_outputs = SchedulerOutputs( - scheduled_seq_groups=self.running, - prompt_run=False, - num_batched_tokens=num_batched_tokens, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ignored_seq_groups=[], - ) - return scheduler_outputs - - def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: - # Schedule sequence groups. - # This function call changes the internal states of the scheduler - # such as self.running, self.swapped, and self.waiting. - scheduler_outputs = self._schedule() - - # Create input data structures. - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - for seq_group in scheduler_outputs.scheduled_seq_groups: - seq_data: Dict[int, SequenceData] = {} - block_tables: Dict[int, List[int]] = {} - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - seq_id = seq.seq_id - seq_data[seq_id] = seq.data - block_tables[seq_id] = self.block_manager.get_block_table(seq) - - seq_group_metadata = SequenceGroupMetadata( - request_id=seq_group.request_id, - is_prompt=scheduler_outputs.prompt_run, - seq_data=seq_data, - sampling_params=seq_group.sampling_params, - block_tables=block_tables, - lora_request=seq_group.lora_request, - prefix=seq_group.prefix, - ) - seq_group_metadata_list.append(seq_group_metadata) - return seq_group_metadata_list, scheduler_outputs - - def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None: - self.block_manager.fork(parent_seq, child_seq) - - def free_seq(self, seq: Sequence) -> None: - self.block_manager.free(seq) - - def free_finished_seq_groups(self) -> None: - self.running = deque(seq_group for seq_group in self.running - if not seq_group.is_finished()) - - def _allocate(self, seq_group: SequenceGroup) -> None: - self.block_manager.allocate(seq_group) - for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): - seq.status = SequenceStatus.RUNNING - - def _append_slot( - self, - seq_group: SequenceGroup, - blocks_to_copy: Dict[int, List[int]], - ) -> None: - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - ret = self.block_manager.append_slot(seq) - if ret is not None: - src_block, dst_block = ret - if src_block in blocks_to_copy: - blocks_to_copy[src_block].append(dst_block) - else: - blocks_to_copy[src_block] = [dst_block] - - def _preempt( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: Dict[int, int], - preemption_mode: Optional[PreemptionMode] = None, - ) -> None: - # If preemption mode is not specified, we determine the mode as follows: - # We use recomputation by default since it incurs lower overhead than - # swapping. However, when the sequence group has multiple sequences - # (e.g., beam search), recomputation is not currently supported. In - # such a case, we use swapping instead. - # FIXME(woosuk): This makes our scheduling policy a bit bizarre. - # As swapped sequences are prioritized over waiting sequences, - # sequence groups with multiple sequences are implicitly prioritized - # over sequence groups with a single sequence. - # TODO(woosuk): Support recomputation for sequence groups with multiple - # sequences. This may require a more sophisticated CUDA kernel. 
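# NOTE (summary of the policy above): single-sequence groups default to
# RECOMPUTE, which is cheaper; multi-sequence groups (e.g. beam search)
# default to SWAP.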
- if preemption_mode is None: - if seq_group.get_max_num_running_seqs() == 1: - preemption_mode = PreemptionMode.RECOMPUTE - else: - preemption_mode = PreemptionMode.SWAP - if preemption_mode == PreemptionMode.RECOMPUTE: - self._preempt_by_recompute(seq_group) - elif preemption_mode == PreemptionMode.SWAP: - self._preempt_by_swap(seq_group, blocks_to_swap_out) - else: - raise AssertionError("Invalid preemption mode.") - - def _preempt_by_recompute( - self, - seq_group: SequenceGroup, - ) -> None: - seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - assert len(seqs) == 1 - for seq in seqs: - seq.status = SequenceStatus.WAITING - self.block_manager.free(seq) - # NOTE: For FCFS, we insert the preempted sequence group to the front - # of the waiting queue. - self.waiting.appendleft(seq_group) - - def _preempt_by_swap( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: Dict[int, int], - ) -> None: - self._swap_out(seq_group, blocks_to_swap_out) - self.swapped.append(seq_group) - - def _swap_in( - self, - seq_group: SequenceGroup, - blocks_to_swap_in: Dict[int, int], - ) -> None: - mapping = self.block_manager.swap_in(seq_group) - blocks_to_swap_in.update(mapping) - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - seq.status = SequenceStatus.RUNNING - - def _swap_out( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: Dict[int, int], - ) -> None: - if not self.block_manager.can_swap_out(seq_group): - # FIXME(woosuk): Abort the sequence group instead of aborting the - # entire engine. - raise RuntimeError( - "Aborted due to the lack of CPU swap space. Please increase " - "the swap space to avoid this error.") - mapping = self.block_manager.swap_out(seq_group) - blocks_to_swap_out.update(mapping) - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - seq.status = SequenceStatus.SWAPPED diff --git a/vllm/engine/__init__.py b/vllm/engine/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py deleted file mode 100644 index 231ce3321cdc40e32b620fa9c3939e88a1c1b1b6..0000000000000000000000000000000000000000 --- a/vllm/engine/arg_utils.py +++ /dev/null @@ -1,326 +0,0 @@ -import argparse -import dataclasses -from dataclasses import dataclass -from typing import Optional, Tuple - -from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, - SchedulerConfig, LoRAConfig) - - -@dataclass -class EngineArgs: - """Arguments for vLLM engine.""" - model: str - tokenizer: Optional[str] = None - tokenizer_mode: str = 'auto' - trust_remote_code: bool = False - download_dir: Optional[str] = None - load_format: str = 'auto' - dtype: str = 'auto' - kv_cache_dtype: str = 'auto' - seed: int = 0 - max_model_len: Optional[int] = None - worker_use_ray: bool = False - pipeline_parallel_size: int = 1 - tensor_parallel_size: int = 1 - max_parallel_loading_workers: Optional[int] = None - block_size: int = 16 - swap_space: int = 4 # GiB - gpu_memory_utilization: float = 0.90 - max_num_batched_tokens: Optional[int] = None - max_num_seqs: int = 256 - max_paddings: int = 256 - disable_log_stats: bool = False - revision: Optional[str] = None - tokenizer_revision: Optional[str] = None - quantization: Optional[str] = None - enforce_eager: bool = False - max_context_len_to_capture: int = 8192 - disable_custom_all_reduce: bool = False - enable_lora: bool = False - max_loras: int = 1 - max_lora_rank: int = 16 - lora_extra_vocab_size: int = 256 - 
lora_dtype = 'auto' - max_cpu_loras: Optional[int] = None - - def __post_init__(self): - if self.tokenizer is None: - self.tokenizer = self.model - - @staticmethod - def add_cli_args( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: - """Shared CLI arguments for vLLM engine.""" - - # NOTE: If you update any of the arguments below, please also - # make sure to update docs/source/models/engine_args.rst - - # Model arguments - parser.add_argument( - '--model', - type=str, - default='facebook/opt-125m', - help='name or path of the huggingface model to use') - parser.add_argument( - '--tokenizer', - type=str, - default=EngineArgs.tokenizer, - help='name or path of the huggingface tokenizer to use') - parser.add_argument( - '--revision', - type=str, - default=None, - help='the specific model version to use. It can be a branch ' - 'name, a tag name, or a commit id. If unspecified, will use ' - 'the default version.') - parser.add_argument( - '--tokenizer-revision', - type=str, - default=None, - help='the specific tokenizer version to use. It can be a branch ' - 'name, a tag name, or a commit id. If unspecified, will use ' - 'the default version.') - parser.add_argument('--tokenizer-mode', - type=str, - default=EngineArgs.tokenizer_mode, - choices=['auto', 'slow'], - help='tokenizer mode. "auto" will use the fast ' - 'tokenizer if available, and "slow" will ' - 'always use the slow tokenizer.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument('--download-dir', - type=str, - default=EngineArgs.download_dir, - help='directory to download and load the weights, ' - 'default to the default cache dir of ' - 'huggingface') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'], - help='The format of the model weights to load. ' - '"auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available. ' - '"pt" will load the weights in the pytorch bin format. ' - '"safetensors" will load the weights in the safetensors format. ' - '"npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading. ' - '"dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.') - parser.add_argument( - '--dtype', - type=str, - default=EngineArgs.dtype, - choices=[ - 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32' - ], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8_e5m2'], - default='auto', - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. Note FP8 is not supported when cuda version is ' - 'lower than 11.8.') - parser.add_argument('--max-model-len', - type=int, - default=None, - help='model context length. 
If unspecified, ' - 'will be automatically derived from the model.') - # Parallel arguments - parser.add_argument('--worker-use-ray', - action='store_true', - help='use Ray for distributed serving, will be ' - 'automatically set when using more than 1 GPU') - parser.add_argument('--pipeline-parallel-size', - '-pp', - type=int, - default=EngineArgs.pipeline_parallel_size, - help='number of pipeline stages') - parser.add_argument('--tensor-parallel-size', - '-tp', - type=int, - default=EngineArgs.tensor_parallel_size, - help='number of tensor parallel replicas') - parser.add_argument( - '--max-parallel-loading-workers', - type=int, - help='load model sequentially in multiple batches, ' - 'to avoid RAM OOM when using tensor ' - 'parallel and large models') - # KV cache arguments - parser.add_argument('--block-size', - type=int, - default=EngineArgs.block_size, - choices=[8, 16, 32], - help='token block size') - # TODO(woosuk): Support fine-grained seeds (e.g., seed per request). - parser.add_argument('--seed', - type=int, - default=EngineArgs.seed, - help='random seed') - parser.add_argument('--swap-space', - type=int, - default=EngineArgs.swap_space, - help='CPU swap space size (GiB) per GPU') - parser.add_argument( - '--gpu-memory-utilization', - type=float, - default=EngineArgs.gpu_memory_utilization, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument('--max-num-batched-tokens', - type=int, - default=EngineArgs.max_num_batched_tokens, - help='maximum number of batched tokens per ' - 'iteration') - parser.add_argument('--max-num-seqs', - type=int, - default=EngineArgs.max_num_seqs, - help='maximum number of sequences per iteration') - parser.add_argument('--max-paddings', - type=int, - default=EngineArgs.max_paddings, - help='maximum number of paddings in a batch') - parser.add_argument('--disable-log-stats', - action='store_true', - help='disable logging statistics') - # Quantization settings. - parser.add_argument('--quantization', - '-q', - type=str, - choices=['awq', 'gptq', 'squeezellm', None], - default=None, - help='Method used to quantize the weights. If ' - 'None, we first check the `quantization_config` ' - 'attribute in the model config file. If that is ' - 'None, we assume the model weights are not ' - 'quantized and use `dtype` to determine the data ' - 'type of the weights.') - parser.add_argument('--enforce-eager', - action='store_true', - help='Always use eager-mode PyTorch. If False, ' - 'will use eager mode and CUDA graph in hybrid ' - 'for maximal performance and flexibility.') - parser.add_argument('--max-context-len-to-capture', - type=int, - default=EngineArgs.max_context_len_to_capture, - help='maximum context length covered by CUDA ' - 'graphs. 
When a sequence has context length ' - 'larger than this, we fall back to eager mode.') - parser.add_argument('--disable-custom-all-reduce', - action='store_true', - default=EngineArgs.disable_custom_all_reduce, - help='See ParallelConfig') - # LoRA related configs - parser.add_argument('--enable-lora', - action='store_true', - help='If True, enable handling of LoRA adapters.') - parser.add_argument('--max-loras', - type=int, - default=EngineArgs.max_loras, - help='Max number of LoRAs in a single batch.') - parser.add_argument('--max-lora-rank', - type=int, - default=EngineArgs.max_lora_rank, - help='Max LoRA rank.') - parser.add_argument( - '--lora-extra-vocab-size', - type=int, - default=EngineArgs.lora_extra_vocab_size, - help=('Maximum size of extra vocabulary that can be ' - 'present in a LoRA adapter (added to the base ' - 'model vocabulary).')) - parser.add_argument( - '--lora-dtype', - type=str, - default=EngineArgs.lora_dtype, - choices=['auto', 'float16', 'bfloat16', 'float32'], - help=('Data type for LoRA. If auto, will default to ' - 'base model dtype.')) - parser.add_argument( - '--max-cpu-loras', - type=int, - default=EngineArgs.max_cpu_loras, - help=('Maximum number of LoRAs to store in CPU memory. ' - 'Must be >= than max_num_seqs. ' - 'Defaults to max_num_seqs.')) - return parser - - @classmethod - def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs': - # Get the list of attributes of this dataclass. - attrs = [attr.name for attr in dataclasses.fields(cls)] - # Set the attributes from the parsed arguments. - engine_args = cls(**{attr: getattr(args, attr) for attr in attrs}) - return engine_args - - def create_engine_configs( - self, - ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig, - Optional[LoRAConfig]]: - model_config = ModelConfig(self.model, self.tokenizer, - self.tokenizer_mode, self.trust_remote_code, - self.download_dir, self.load_format, - self.dtype, self.seed, self.revision, - self.tokenizer_revision, self.max_model_len, - self.quantization, self.enforce_eager, - self.max_context_len_to_capture) - cache_config = CacheConfig(self.block_size, - self.gpu_memory_utilization, - self.swap_space, self.kv_cache_dtype, - model_config.get_sliding_window()) - parallel_config = ParallelConfig(self.pipeline_parallel_size, - self.tensor_parallel_size, - self.worker_use_ray, - self.max_parallel_loading_workers, - self.disable_custom_all_reduce) - scheduler_config = SchedulerConfig(self.max_num_batched_tokens, - self.max_num_seqs, - model_config.max_model_len, - self.max_paddings) - lora_config = LoRAConfig( - max_lora_rank=self.max_lora_rank, - max_loras=self.max_loras, - lora_extra_vocab_size=self.lora_extra_vocab_size, - lora_dtype=self.lora_dtype, - max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras - and self.max_cpu_loras > 0 else None) if self.enable_lora else None - return model_config, cache_config, parallel_config, scheduler_config, lora_config - - -@dataclass -class AsyncEngineArgs(EngineArgs): - """Arguments for asynchronous vLLM engine.""" - engine_use_ray: bool = False - disable_log_requests: bool = False - max_log_len: Optional[int] = None - - @staticmethod - def add_cli_args( - parser: argparse.ArgumentParser) -> argparse.ArgumentParser: - parser = EngineArgs.add_cli_args(parser) - parser.add_argument('--engine-use-ray', - action='store_true', - help='use Ray to start the LLM engine in a ' - 'separate process as the server process.') - parser.add_argument('--disable-log-requests', - action='store_true', - help='disable 
logging requests') - parser.add_argument('--max-log-len', - type=int, - default=None, - help='max number of prompt characters or prompt ' - 'ID numbers being printed in log. ' - 'Default: unlimited.') - return parser diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py deleted file mode 100644 index 2885aab9f31611813dcd2e44674f20119058baa2..0000000000000000000000000000000000000000 --- a/vllm/engine/async_llm_engine.py +++ /dev/null @@ -1,629 +0,0 @@ -import asyncio -import time -from functools import partial -from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, - Union, AsyncIterator) - -from vllm.lora.request import LoRARequest -from vllm.config import ModelConfig -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.llm_engine import LLMEngine -from vllm.engine.ray_utils import initialize_cluster, ray -from vllm.logger import init_logger -from vllm.outputs import RequestOutput -from vllm.sampling_params import SamplingParams - -logger = init_logger(__name__) - - -class AsyncEngineDeadError(RuntimeError): - pass - - -def _raise_exception_on_finish(task: asyncio.Task, - request_tracker: "RequestTracker") -> None: - msg = ("Task finished unexpectedly. This should never happen! " - "Please open an issue on Github.") - try: - try: - task.result() - except asyncio.CancelledError: - return - except Exception as exc: - raise AsyncEngineDeadError( - msg + " See stack trace above for the actual cause.") from exc - raise AsyncEngineDeadError(msg) - except Exception as exc: - request_tracker.propagate_exception(exc) - raise exc - - -class AsyncStream: - """A stream of RequestOutputs for a request that can be - iterated over asynchronously.""" - - def __init__(self, request_id: str) -> None: - self.request_id = request_id - self._queue = asyncio.Queue() - self._finished = False - - def put(self, item: RequestOutput) -> None: - if self._finished: - return - self._queue.put_nowait(item) - - def finish(self) -> None: - self._queue.put_nowait(StopAsyncIteration()) - self._finished = True - - @property - def finished(self) -> bool: - return self._finished - - def __aiter__(self): - return self - - async def __anext__(self) -> RequestOutput: - result = await self._queue.get() - if isinstance(result, Exception): - raise result - return result - - -class RequestTracker: - """Synchronous abstraction for tracking requests.""" - - def __init__(self) -> None: - self._request_streams: Dict[str, AsyncStream] = {} - self._finished_requests: asyncio.Queue[str] = asyncio.Queue() - self._new_requests: asyncio.Queue[Tuple[AsyncStream, - dict]] = asyncio.Queue() - self.new_requests_event = None - - def __contains__(self, item): - return item in self._request_streams - - def init_event(self): - self.new_requests_event = asyncio.Event() - - def propagate_exception(self, - exc: Exception, - request_id: Optional[str] = None) -> None: - """Propagate an exception to request streams - (all if request_id is None).""" - if request_id is not None: - self._request_streams[request_id].put(exc) - else: - for stream in self._request_streams.values(): - stream.put(exc) - - def process_request_output(self, - request_output: RequestOutput, - *, - verbose: bool = False) -> None: - """Process a request output from the engine.""" - request_id = request_output.request_id - - self._request_streams[request_id].put(request_output) - if request_output.finished: - if verbose: - logger.info(f"Finished request {request_id}.") - self.abort_request(request_id) - - def add_request(self, 
request_id: str, - **engine_add_request_kwargs) -> AsyncStream: - """Add a request to be sent to the engine on the next background - loop iteration.""" - if request_id in self._request_streams: - raise KeyError(f"Request {request_id} already exists.") - - stream = AsyncStream(request_id) - self._new_requests.put_nowait((stream, { - "request_id": request_id, - **engine_add_request_kwargs - })) - - self.new_requests_event.set() - - return stream - - def abort_request(self, request_id: str, *, verbose: bool = False) -> None: - """Abort a request during next background loop iteration.""" - if verbose: - logger.info(f"Aborted request {request_id}.") - - self._finished_requests.put_nowait(request_id) - - if request_id not in self._request_streams or self._request_streams[ - request_id].finished: - # The request has already finished or been aborted. - return - - self._request_streams[request_id].finish() - - def get_new_and_finished_requests(self) -> Tuple[List[Dict], Set[str]]: - """Get the new requests and finished requests to be - sent to the engine.""" - new_requests: List[Dict] = [] - finished_requests: Set[str] = set() - - while not self._finished_requests.empty(): - request_id = self._finished_requests.get_nowait() - finished_requests.add(request_id) - self._request_streams.pop(request_id, None) - - while not self._new_requests.empty(): - stream, new_request = self._new_requests.get_nowait() - if stream.request_id in finished_requests: - # The request has already been aborted. - stream.finish() - continue - self._request_streams[stream.request_id] = stream - new_requests.append(new_request) - - self.new_requests_event.clear() - - return new_requests, finished_requests - - async def wait_for_new_requests(self): - await self.new_requests_event.wait() - - -class _AsyncLLMEngine(LLMEngine): - """Extension of LLMEngine to add async methods.""" - - async def step_async(self) -> List[RequestOutput]: - """Performs one decoding iteration and returns newly generated results. - The workers are ran asynchronously if possible. - - This function performs one decoding iteration of the engine. It first - schedules the sequences to be executed in the next iteration and the - token blocks to be swapped in/out/copy. Then, it executes the model - and updates the scheduler with the model outputs. Finally, it decodes - the sequences and returns the newly generated results. - """ - seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule() - - if not scheduler_outputs.is_empty(): - # Execute the model. - all_outputs = await self._run_workers_async( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, - "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, - "blocks_to_copy": scheduler_outputs.blocks_to_copy, - }) - - # Only the driver worker returns the sampling results. 
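The `RequestTracker` above moves work between callers and the background engine loop through two `asyncio` queues plus an event: callers enqueue new requests (or aborts), and the loop drains both queues on its next pass. A minimal stdlib-only sketch of that handoff, with illustrative names rather than the vLLM API:

```python
import asyncio
from typing import List, Set, Tuple


class MiniTracker:
    """Toy stand-in for the new/finished request handoff."""

    def __init__(self) -> None:
        self._new: asyncio.Queue = asyncio.Queue()
        self._finished: asyncio.Queue = asyncio.Queue()
        self.new_requests_event = asyncio.Event()

    def add(self, request_id: str, payload: dict) -> None:
        self._new.put_nowait((request_id, payload))
        self.new_requests_event.set()          # wake the engine loop

    def abort(self, request_id: str) -> None:
        self._finished.put_nowait(request_id)

    def drain(self) -> Tuple[List[Tuple[str, dict]], Set[str]]:
        finished: Set[str] = set()
        while not self._finished.empty():
            finished.add(self._finished.get_nowait())
        new: List[Tuple[str, dict]] = []
        while not self._new.empty():
            request_id, payload = self._new.get_nowait()
            if request_id not in finished:     # dropped if aborted before pickup
                new.append((request_id, payload))
        self.new_requests_event.clear()
        return new, finished


async def main() -> None:
    tracker = MiniTracker()
    tracker.add("req-0", {"prompt": "hello"})
    tracker.abort("req-1")
    await tracker.new_requests_event.wait()
    print(tracker.drain())


asyncio.run(main())
```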
- output = all_outputs[0] - else: - output = [] - - return self._process_model_outputs(output, scheduler_outputs) - - async def encode_request_async( - self, - request_id: str, # pylint: disable=unused-argument - prompt: Optional[str], - prompt_token_ids: Optional[List[int]] = None, - lora_request: Optional[LoRARequest] = None, - ): - if prompt_token_ids is None: - assert prompt is not None - prompt_token_ids = await self.tokenizer.encode_async( - request_id=request_id, - prompt=prompt, - lora_request=lora_request) - return prompt_token_ids - - async def add_request_async( - self, - request_id: str, - prompt: Optional[str], - sampling_params: SamplingParams, - prompt_token_ids: Optional[List[int]] = None, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, - ) -> None: - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - if arrival_time is None: - arrival_time = time.time() - prompt_token_ids = await self.encode_request_async( - request_id=request_id, - prompt=prompt, - prompt_token_ids=prompt_token_ids, - lora_request=lora_request) - - return self.add_request( - request_id, - prompt=prompt, - prompt_token_ids=prompt_token_ids, - sampling_params=sampling_params, - arrival_time=arrival_time, - lora_request=lora_request, - prefix_pos=prefix_pos, - ) - - async def _run_workers_async( - self, - method: str, - *args, - driver_args: Optional[List[Any]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - coros = [] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - # Run the driver worker asynchronously. - driver_executor = getattr(self.driver_worker, method) - coros.append(asyncio.get_event_loop().run_in_executor( - None, partial(driver_executor, *driver_args, **driver_kwargs))) - - # Run the ray workers asynchronously. - for worker in self.workers: - coros.append(worker.execute_method.remote(method, *args, **kwargs)) - - all_outputs = await asyncio.gather(*coros) - return all_outputs - - -class AsyncLLMEngine: - """An asynchronous wrapper for LLMEngine. - - This class is used to wrap the LLMEngine class to make it asynchronous. It - uses asyncio to create a background loop that keeps processing incoming - requests. The LLMEngine is kicked by the generate method when there - are requests in the waiting queue. The generate method yields the outputs - from the LLMEngine to the caller. - - NOTE: For the comprehensive list of arguments, see `LLMEngine`. - - Args: - worker_use_ray: Whether to use Ray for model workers. Required for - distributed execution. Should be the same as - `parallel_config.worker_use_ray`. - engine_use_ray: Whether to make LLMEngine a Ray actor. If so, the - async frontend will be executed in a separate process as the - model workers. - log_requests: Whether to log the requests. - start_engine_loop: If True, the background task to run the engine - will be automatically started in the generate call. - *args: Arguments for LLMEngine. - *kwargs: Arguments for LLMEngine. 
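`_run_workers_async` above overlaps the blocking driver call with the Ray workers by pushing the driver into the default thread-pool executor and `gather`-ing everything; the driver's result stays at index 0. A rough stdlib-only sketch of that pattern (the driver/worker functions here are stand-ins, not real vLLM calls):

```python
import asyncio
import time
from functools import partial
from typing import List


def blocking_driver_step(batch: int) -> str:
    time.sleep(0.05)                       # stands in for GPU-bound driver work
    return f"driver finished batch {batch}"


async def remote_worker_step(rank: int, batch: int) -> str:
    await asyncio.sleep(0.05)              # stands in for a Ray remote call
    return f"worker {rank} finished batch {batch}"


async def run_all(batch: int) -> List[str]:
    loop = asyncio.get_running_loop()
    coros = [loop.run_in_executor(None, partial(blocking_driver_step, batch))]
    coros += [remote_worker_step(rank, batch) for rank in (1, 2)]
    # Driver output stays first, mirroring `all_outputs[0]` in `step_async`.
    return await asyncio.gather(*coros)


print(asyncio.run(run_all(batch=0)))
```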
- """ - - _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine - - def __init__(self, - worker_use_ray: bool, - engine_use_ray: bool, - *args, - log_requests: bool = True, - max_log_len: Optional[int] = None, - start_engine_loop: bool = True, - **kwargs) -> None: - self.worker_use_ray = worker_use_ray - self.engine_use_ray = engine_use_ray - self.log_requests = log_requests - self.max_log_len = max_log_len - self.engine = self._init_engine(*args, **kwargs) - - self.background_loop = None - # We need to keep a reference to unshielded - # task as well to prevent it from being garbage - # collected - self._background_loop_unshielded = None - self.start_engine_loop = start_engine_loop - self._request_tracker = RequestTracker() - - @property - def is_running(self) -> bool: - return (self.background_loop is not None - and not self.background_loop.done()) - - def start_background_loop(self) -> None: - """Start the background loop.""" - if self.is_running: - raise RuntimeError("Background loop is already running.") - self._request_tracker.init_event() - - self._background_loop_unshielded = asyncio.get_event_loop( - ).create_task(self.run_engine_loop()) - self._background_loop_unshielded.add_done_callback( - partial(_raise_exception_on_finish, - request_tracker=self._request_tracker)) - self.background_loop = asyncio.shield(self._background_loop_unshielded) - - def _init_engine(self, *args, - **kwargs) -> Union[_AsyncLLMEngine, "ray.ObjectRef"]: - if not self.engine_use_ray: - engine_class = self._engine_class - elif self.worker_use_ray: - engine_class = ray.remote(num_cpus=0)(self._engine_class).remote - else: - # FIXME(woosuk): This is a bit hacky. Be careful when changing the - # order of the arguments. - cache_config = args[1] - parallel_config = args[2] - if parallel_config.tensor_parallel_size == 1: - num_gpus = cache_config.gpu_memory_utilization - else: - num_gpus = 1 - engine_class = ray.remote(num_gpus=num_gpus)( - self._engine_class).remote - return engine_class(*args, **kwargs) - - async def engine_step(self) -> bool: - """Kick the engine to process the waiting requests. - - Returns True if there are in-progress requests.""" - - new_requests, finished_requests = ( - self._request_tracker.get_new_and_finished_requests()) - - for new_request in new_requests: - # Add the request into the vLLM engine's waiting queue. - # TODO: Maybe add add_request_batch to reduce Ray overhead - if self.engine_use_ray: - await self.engine.add_request.remote(**new_request) - else: - await self.engine.add_request_async(**new_request) - - if finished_requests: - await self._engine_abort(finished_requests) - - if self.engine_use_ray: - request_outputs = await self.engine.step.remote() - else: - request_outputs = await self.engine.step_async() - - # Put the outputs into the corresponding streams. - for request_output in request_outputs: - self._request_tracker.process_request_output( - request_output, verbose=self.log_requests) - - return len(request_outputs) > 0 - - async def _engine_abort(self, request_ids: Iterable[str]): - if self.engine_use_ray: - await self.engine.abort_request.remote(request_ids) - else: - self.engine.abort_request(request_ids) - - async def run_engine_loop(self): - # Initialize the RequestTracker here so it uses the right event loop. 
- has_requests_in_progress = False - while True: - if not has_requests_in_progress: - await self._request_tracker.wait_for_new_requests() - has_requests_in_progress = await self.engine_step() - await asyncio.sleep(0) - - async def add_request( - self, - request_id: str, - prompt: Optional[str], - sampling_params: SamplingParams, - prompt_token_ids: Optional[List[int]] = None, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, - ) -> AsyncStream: - if self.log_requests: - shortened_prompt = prompt - shortened_token_ids = prompt_token_ids - if self.max_log_len is not None: - if shortened_prompt is not None: - shortened_prompt = shortened_prompt[:self.max_log_len] - if shortened_token_ids is not None: - shortened_token_ids = shortened_token_ids[:self. - max_log_len] - logger.info(f"Received request {request_id}: " - f"prompt: {shortened_prompt!r}, " - f"prefix_pos: {prefix_pos}," - f"sampling params: {sampling_params}, " - f"prompt token ids: {shortened_token_ids}, " - f"lora_request: {lora_request}.") - - if not self.is_running: - if self.start_engine_loop: - self.start_background_loop() - else: - raise AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - if arrival_time is None: - arrival_time = time.time() - prompt_token_ids = await self.engine.encode_request_async( - request_id=request_id, - prompt=prompt, - prompt_token_ids=prompt_token_ids, - lora_request=lora_request) - - stream = self._request_tracker.add_request( - request_id, - prompt=prompt, - sampling_params=sampling_params, - prompt_token_ids=prompt_token_ids, - arrival_time=arrival_time, - lora_request=lora_request, - prefix_pos=prefix_pos) - - return stream - - async def generate( - self, - prompt: Optional[str], - sampling_params: SamplingParams, - request_id: str, - prompt_token_ids: Optional[List[int]] = None, - lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, - ) -> AsyncIterator[RequestOutput]: - """Generate outputs for a request. - - Generate outputs for a request. This method is a coroutine. It adds the - request into the waiting queue of the LLMEngine and streams the outputs - from the LLMEngine to the caller. - - Args: - prompt: The prompt string. Can be None if prompt_token_ids is - provided. - sampling_params: The sampling parameters of the request. - request_id: The unique id of the request. - prompt_token_ids: The token IDs of the prompt. If None, we - use the tokenizer to convert the prompts to token IDs. - lora_request: LoRA request to use for generation, if any. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. - - Yields: - The output `RequestOutput` objects from the LLMEngine for the - request. - - Details: - - If the engine is not running, start the background loop, - which iteratively invokes - :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` - to process the waiting requests. - - Add the request to the engine's `RequestTracker`. - On the next background loop, this request will be sent to - the underlying engine. - Also, a corresponding `AsyncStream` will be created. 
- - Wait for the request outputs from `AsyncStream` and yield them. - - Example: - >>> # Please refer to entrypoints/api_server.py for - >>> # the complete example. - >>> - >>> # initialize the engine and the example input - >>> engine = AsyncLLMEngine.from_engine_args(engine_args) - >>> example_input = { - >>> "prompt": "What is LLM?", - >>> "stream": False, # assume the non-streaming case - >>> "temperature": 0.0, - >>> "request_id": 0, - >>> } - >>> - >>> # start the generation - >>> results_generator = engine.generate( - >>> example_input["prompt"], - >>> SamplingParams(temperature=example_input["temperature"]), - >>> example_input["request_id"]) - >>> - >>> # get the results - >>> final_output = None - >>> async for request_output in results_generator: - >>> if await request.is_disconnected(): - >>> # Abort the request if the client disconnects. - >>> await engine.abort(request_id) - >>> # Return or raise an error - >>> ... - >>> final_output = request_output - >>> - >>> # Process and return the final output - >>> ... - """ - # Preprocess the request. - # This should not be used for logging, as it is monotonic time. - arrival_time = time.monotonic() - - try: - stream = await self.add_request( - request_id, - prompt, - sampling_params, - prompt_token_ids=prompt_token_ids, - arrival_time=arrival_time, - lora_request=lora_request, - prefix_pos=prefix_pos, - ) - - async for request_output in stream: - yield request_output - except (Exception, asyncio.CancelledError) as e: - # If there is an exception or coroutine is cancelled, abort the - # request. - self._abort(request_id) - raise e - - async def abort(self, request_id: str) -> None: - """Abort a request. - - Abort a submitted request. If the request is finished or not found, - this method will be a no-op. - - Args: - request_id: The unique id of the request. - """ - if not self.is_running: - raise AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - return self._abort(request_id) - - def _abort(self, request_id: str) -> None: - """Abort a request. - - Abort a submitted request. If the request is finished or not found, - this method will be a no-op. - - Args: - request_id: The unique id of the request. - """ - self._request_tracker.abort_request(request_id, - verbose=self.log_requests) - - async def get_model_config(self) -> ModelConfig: - """Get the model configuration of the vLLM engine.""" - if self.engine_use_ray: - return await self.engine.get_model_config.remote() - else: - return self.engine.get_model_config() - - @classmethod - def from_engine_args(cls, - engine_args: AsyncEngineArgs, - start_engine_loop: bool = True) -> "AsyncLLMEngine": - """Creates an async LLM engine from the engine arguments.""" - # Create the engine configs. - engine_configs = engine_args.create_engine_configs() - parallel_config = engine_configs[2] - # Initialize the cluster. - placement_group = initialize_cluster(parallel_config, - engine_args.engine_use_ray) - # Create the async LLM engine. 
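`generate` above wraps the per-request `AsyncStream` in a `try`/`except` so that a cancelled or failed consumer aborts the request instead of leaking it. A toy, stdlib-only version of that consumption pattern (the fake stream stands in for the engine's stream of `RequestOutput` objects):

```python
import asyncio
from typing import AsyncIterator


async def fake_stream(request_id: str) -> AsyncIterator[str]:
    # Stand-in for the per-request AsyncStream of RequestOutputs.
    for i in range(5):
        await asyncio.sleep(0.01)
        yield f"{request_id}: token {i}"


async def consume(request_id: str) -> None:
    try:
        async for output in fake_stream(request_id):
            print(output)
    except asyncio.CancelledError:
        # Mirror generate(): abort the request if the consumer goes away.
        print(f"aborting {request_id}")
        raise


async def main() -> None:
    task = asyncio.create_task(consume("req-0"))
    await asyncio.sleep(0.025)
    task.cancel()
    await asyncio.gather(task, return_exceptions=True)


asyncio.run(main())
```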
- engine = cls(parallel_config.worker_use_ray, - engine_args.engine_use_ray, - *engine_configs, - placement_group, - log_requests=not engine_args.disable_log_requests, - log_stats=not engine_args.disable_log_stats, - max_log_len=engine_args.max_log_len, - start_engine_loop=start_engine_loop) - return engine - - async def do_log_stats(self) -> None: - if self.engine_use_ray: - await self.engine.do_log_stats.remote() - else: - self.engine.do_log_stats() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py deleted file mode 100644 index 0d836a1fb13a9f1919c5dcf07fe0caff877edd01..0000000000000000000000000000000000000000 --- a/vllm/engine/llm_engine.py +++ /dev/null @@ -1,990 +0,0 @@ -import copy -from collections import defaultdict -import os -import time -from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, - Union) - -from vllm.lora.request import LoRARequest -from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, - SchedulerConfig, LoRAConfig) -from vllm.core.scheduler import Scheduler, SchedulerOutputs -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.metrics import record_metrics -from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray -from vllm.logger import init_logger -from vllm.outputs import RequestOutput -from vllm.sampling_params import SamplingParams -from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup, - SequenceGroupOutput, SequenceOutput, SequenceStatus) -from vllm.transformers_utils.tokenizer import (detokenize_incrementally, - TokenizerGroup) -from vllm.utils import Counter, set_cuda_visible_devices, get_ip, get_open_port, get_distributed_init_method - -if ray: - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup - -logger = init_logger(__name__) - -_LOGGING_INTERVAL_SEC = 5 - - -class LLMEngine: - """An LLM engine that receives requests and generates texts. - - This is the main class for the vLLM engine. It receives requests - from clients and generates texts from the LLM. It includes a tokenizer, a - language model (possibly distributed across multiple GPUs), and GPU memory - space allocated for intermediate states (aka KV cache). This class utilizes - iteration-level scheduling and efficient memory management to maximize the - serving throughput. - - The `LLM` class wraps this class for offline batched inference and the - `AsyncLLMEngine` class wraps this class for online serving. - - NOTE: The config arguments are derived from the `EngineArgs` class. For the - comprehensive list of arguments, see `EngineArgs`. - - Args: - model_config: The configuration related to the LLM model. - cache_config: The configuration related to the KV cache memory - management. - parallel_config: The configuration related to distributed execution. - scheduler_config: The configuration related to the request scheduler. - placement_group: Ray placement group for distributed execution. - Required for distributed execution. - log_stats: Whether to log statistics. 
- """ - - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - lora_config: Optional[LoRAConfig], - placement_group: Optional["PlacementGroup"], - log_stats: bool, - ) -> None: - logger.info( - "Initializing an LLM engine with config: " - f"model={model_config.model!r}, " - f"tokenizer={model_config.tokenizer!r}, " - f"tokenizer_mode={model_config.tokenizer_mode}, " - f"revision={model_config.revision}, " - f"tokenizer_revision={model_config.tokenizer_revision}, " - f"trust_remote_code={model_config.trust_remote_code}, " - f"dtype={model_config.dtype}, " - f"max_seq_len={model_config.max_model_len}, " - f"download_dir={model_config.download_dir!r}, " - f"load_format={model_config.load_format}, " - f"tensor_parallel_size={parallel_config.tensor_parallel_size}, " - f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, " - f"quantization={model_config.quantization}, " - f"enforce_eager={model_config.enforce_eager}, " - f"kv_cache_dtype={cache_config.cache_dtype}, " - f"seed={model_config.seed})") - # TODO(woosuk): Print more configs in debug mode. - - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.log_stats = log_stats - self._verify_args() - - self._init_tokenizer() - self.seq_counter = Counter() - - # Create the parallel GPU workers. - if self.parallel_config.worker_use_ray: - # Disable Ray usage stats collection. - ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - self._init_workers_ray(placement_group) - else: - self._init_workers() - - # Profile the memory usage and initialize the cache. - self._init_cache() - - # Create the scheduler. - self.scheduler = Scheduler(scheduler_config, cache_config, lora_config) - - # Logging. 
- self.last_logging_time = 0.0 - # List of (timestamp, num_tokens) - self.num_prompt_tokens: List[Tuple[float, int]] = [] - # List of (timestamp, num_tokens) - self.num_generation_tokens: List[Tuple[float, int]] = [] - - def get_tokenizer_for_seq(self, sequence: Sequence): - return self.tokenizer.get_lora_tokenizer(sequence.lora_request) - - def _init_workers(self): - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.worker import Worker - - assert self.parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") - - self.workers: List[Worker] = [] - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=True, - ) - self._run_workers("init_model") - self._run_workers("load_model") - - def _init_tokenizer(self, **tokenizer_init_kwargs): - init_kwargs = dict( - enable_lora=bool(self.lora_config), - max_num_seqs=self.scheduler_config.max_num_seqs, - max_input_length=None, - tokenizer_mode=self.model_config.tokenizer_mode, - trust_remote_code=self.model_config.trust_remote_code, - revision=self.model_config.tokenizer_revision) - init_kwargs.update(tokenizer_init_kwargs) - self.tokenizer: TokenizerGroup = TokenizerGroup( - self.model_config.tokenizer, **init_kwargs) - - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - if self.parallel_config.tensor_parallel_size == 1: - num_gpus = self.cache_config.gpu_memory_utilization - else: - num_gpus = 1 - - self.driver_dummy_worker: RayWorkerVllm = None - self.workers: List[RayWorkerVllm] = [] - - driver_ip = get_ip() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("GPU", 0): - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - worker = ray.remote( - num_cpus=0, - num_gpus=num_gpus, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerVllm).remote(self.model_config.trust_remote_code) - - worker_ip = ray.get(worker.get_node_ip.remote()) - if worker_ip == driver_ip and self.driver_dummy_worker is None: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. - self.driver_dummy_worker = worker - else: - self.workers.append(worker) - - if self.driver_dummy_worker is None: - raise ValueError( - "Ray does not allocate any GPUs on the driver node. 
Consider " - "adjusting the Ray placement group or running the driver on a " - "GPU node.") - - driver_node_id, driver_gpu_ids = ray.get( - self.driver_dummy_worker.get_node_and_gpu_ids.remote()) - worker_node_and_gpu_ids = ray.get( - [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) - - node_workers = defaultdict(list) - node_gpus = defaultdict(list) - - node_workers[driver_node_id].append(0) - node_gpus[driver_node_id].extend(driver_gpu_ids) - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, - start=1): - node_workers[node_id].append(i) - node_gpus[node_id].extend(gpu_ids) - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - # Set CUDA_VISIBLE_DEVICES for the driver. - set_cuda_visible_devices(node_gpus[driver_node_id]) - for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids): - worker.set_cuda_visible_devices.remote(node_gpus[node_id]) - - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.worker import Worker - - # Initialize torch distributed process group for the workers. - model_config = copy.deepcopy(self.model_config) - parallel_config = copy.deepcopy(self.parallel_config) - scheduler_config = copy.deepcopy(self.scheduler_config) - - for rank, (worker, (node_id, - _)) in enumerate(zip(self.workers, - worker_node_and_gpu_ids), - start=1): - local_rank = node_workers[node_id].index(rank) - worker.init_worker.remote( - lambda rank=rank, local_rank=local_rank: Worker( - model_config, - parallel_config, - scheduler_config, - local_rank, - rank, - distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - )) - - driver_rank = 0 - driver_local_rank = node_workers[driver_node_id].index(driver_rank) - self.driver_worker = Worker( - model_config, - parallel_config, - scheduler_config, - driver_local_rank, - driver_rank, - distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=True, - ) - - self._run_workers("init_model") - self._run_workers( - "load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers, - ) - - def _verify_args(self) -> None: - self.model_config.verify_with_parallel_config(self.parallel_config) - self.cache_config.verify_with_parallel_config(self.parallel_config) - if self.lora_config: - self.lora_config.verify_with_model_config(self.model_config) - self.lora_config.verify_with_scheduler_config( - self.scheduler_config) - - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - More details can be found in the - :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method - from class :class:`~vllm.worker.Worker`. - - Afterwards, as there may be multiple workers, - we take the minimum number of blocks across all workers - to ensure this can be applied to all of them. - - Finally, the engine will initialize the KV cache - with the calculated number of blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameters. 
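The capacity check described in this docstring (and implemented just below) boils down to comparing the model's context length against `block_size * num_gpu_blocks`. A tiny illustrative sketch of that arithmetic, with made-up numbers:

```python
def check_kv_capacity(block_size: int, num_gpu_blocks: int,
                      max_model_len: int) -> int:
    # Total tokens the GPU KV cache can hold is blocks * tokens-per-block.
    max_cacheable_tokens = block_size * num_gpu_blocks
    if max_model_len > max_cacheable_tokens:
        raise ValueError(
            f"max_model_len ({max_model_len}) exceeds KV cache capacity "
            f"({max_cacheable_tokens}); raise gpu_memory_utilization or "
            "lower max_model_len.")
    return max_cacheable_tokens


# e.g. 16-token blocks and 2048 free blocks leave room for 32768 cached tokens.
print(check_kv_capacity(block_size=16, num_gpu_blocks=2048, max_model_len=4096))
```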
- """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers( - "profile_num_available_blocks", - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config.gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - # FIXME(woosuk): Change to debug log. - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = self.cache_config.block_size * num_gpu_blocks - if self.model_config.max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({self.model_config.max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. - self._run_workers("init_cache_engine", cache_config=self.cache_config) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self._run_workers("warm_up_model") - - @classmethod - def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": - """Creates an LLM engine from the engine arguments.""" - # Create the engine configs. - engine_configs = engine_args.create_engine_configs() - parallel_config = engine_configs[2] - # Initialize the cluster. - placement_group = initialize_cluster(parallel_config) - # Create the LLM engine. - engine = cls(*engine_configs, - placement_group, - log_stats=not engine_args.disable_log_stats) - return engine - - def encode_request( - self, - request_id: str, # pylint: disable=unused-argument - prompt: Optional[str], - prompt_token_ids: Optional[List[int]] = None, - lora_request: Optional[LoRARequest] = None, - ): - if prompt_token_ids is None: - assert prompt is not None - prompt_token_ids = self.tokenizer.encode(request_id=request_id, - prompt=prompt, - lora_request=lora_request) - return prompt_token_ids - - def add_request( - self, - request_id: str, - prompt: Optional[str], - sampling_params: SamplingParams, - prompt_token_ids: Optional[List[int]] = None, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, - ) -> None: - """Add a request to the engine's request pool. - - The request is added to the request pool and will be processed by the - scheduler as `engine.step()` is called. The exact scheduling policy is - determined by the scheduler. - - Args: - request_id: The unique ID of the request. - prompt: The prompt string. Can be None if prompt_token_ids is - provided. - sampling_params: The sampling parameters for text generation. - prompt_token_ids: The token IDs of the prompt. If None, we - use the tokenizer to convert the prompts to token IDs. - arrival_time: The arrival time of the request. If None, we use - the current monotonic time. 
- prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. - - Details: - - Set arrival_time to the current time if it is None. - - Set prompt_token_ids to the encoded prompt if it is None. - - Create `best_of` number of :class:`~vllm.Sequence` objects. - - Create a :class:`~vllm.SequenceGroup` object - from the list of :class:`~vllm.Sequence`. - - Add the :class:`~vllm.SequenceGroup` object to the scheduler. - - Example: - >>> # initialize engine - >>> engine = LLMEngine.from_engine_args(engine_args) - >>> # set request arguments - >>> example_prompt = "Who is the president of the United States?" - >>> sampling_params = SamplingParams(temperature=0.0) - >>> request_id = 0 - >>> - >>> # add the request to the engine - >>> engine.add_request( - >>> str(request_id), - >>> example_prompt, - >>> SamplingParams(temperature=0.0)) - >>> # continue the request processing - >>> ... - """ - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - if arrival_time is None: - arrival_time = time.monotonic() - prompt_token_ids = self.encode_request( - request_id=request_id, - prompt=prompt, - prompt_token_ids=prompt_token_ids, - lora_request=lora_request) - - # Create the sequences. - block_size = self.cache_config.block_size - seq_id = next(self.seq_counter) - seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, - lora_request) - - # Check whether the input specifies prefix - prefix = self.scheduler.prefix_pool.add_or_get_prefix( - prompt_token_ids[:prefix_pos], lora_request.lora_int_id - if lora_request else 0) if prefix_pos is not None else None - - # Create the sequence group. - seq_group = SequenceGroup(request_id, [seq], sampling_params, - arrival_time, lora_request, prefix) - - # Add the sequence group to the scheduler. - self.scheduler.add_seq_group(seq_group) - - def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: - """Aborts a request(s) with the given ID. - - Args: - request_id: The ID(s) of the request to abort. - - Details: - - Refer to the - :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group` - from class :class:`~vllm.core.scheduler.Scheduler`. 
- - Example: - >>> # initialize engine and add a request with request_id - >>> request_id = str(0) - >>> # abort the request - >>> engine.abort_request(request_id) - """ - self.scheduler.abort_seq_group(request_id) - - def get_model_config(self) -> ModelConfig: - """Gets the model configuration.""" - return self.model_config - - def get_num_unfinished_requests(self) -> int: - """Gets the number of unfinished requests.""" - return self.scheduler.get_num_unfinished_seq_groups() - - def has_unfinished_requests(self) -> bool: - """Returns True if there are unfinished requests.""" - return self.scheduler.has_unfinished_seqs() - - def _check_beam_search_early_stopping( - self, - early_stopping: Union[bool, str], - sampling_params: SamplingParams, - best_running_seq: Sequence, - current_worst_seq: Sequence, - ) -> bool: - assert sampling_params.use_beam_search - length_penalty = sampling_params.length_penalty - if early_stopping is True: - return True - - current_worst_score = (current_worst_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - current_worst_seq).eos_token_id)) - if early_stopping is False: - highest_attainable_score = (best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id)) - else: - assert early_stopping == "never" - if length_penalty > 0.0: - # If length_penalty > 0.0, beam search will prefer longer - # sequences. The highest attainable score calculation is - # based on the longest possible sequence length in this case. - max_possible_length = max( - best_running_seq.get_prompt_len() + - sampling_params.max_tokens, - self.scheduler_config.max_model_len) - highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id, - seq_len=max_possible_length)) - else: - # Otherwise, beam search will prefer shorter sequences. The - # highest attainable score calculation is based on the current - # sequence length. - highest_attainable_score = ( - best_running_seq.get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id)) - return current_worst_score >= highest_attainable_score - - def _process_sequence_group_outputs(self, seq_group: SequenceGroup, - outputs: SequenceGroupOutput) -> None: - # Process prompt logprobs - prompt_logprobs = outputs.prompt_logprobs - if prompt_logprobs is not None: - seq_group.prompt_logprobs = prompt_logprobs - - # Process samples - samples = outputs.samples - parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - existing_finished_seqs = seq_group.get_finished_seqs() - parent_child_dict = { - parent_seq.seq_id: [] - for parent_seq in parent_seqs - } - for sample in samples: - parent_child_dict[sample.parent_seq_id].append(sample) - # List of (child, parent) - child_seqs: List[Tuple[Sequence, Sequence]] = [] - - # Process the child samples for each parent sequence - for parent in parent_seqs: - child_samples: List[SequenceOutput] = parent_child_dict[ - parent.seq_id] - if len(child_samples) == 0: - # This parent sequence has no children samples. Remove - # the parent sequence from the sequence group since it will - # not be used in the future iterations. 
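`_check_beam_search_early_stopping` above compares the worst kept finished beam against the best score a running beam could still reach. A standalone sketch of the `early_stopping="never"` branch, assuming the usual length-penalised score of cumulative log-probability divided by `length ** length_penalty` (the exact scoring lives in `Sequence.get_beam_search_score`):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Beam:
    cumulative_logprob: float
    length: int

    def score(self, length_penalty: float,
              length: Optional[int] = None) -> float:
        # Assumed length-penalised score: logprob / len ** penalty.
        seq_len = self.length if length is None else length
        return self.cumulative_logprob / (seq_len ** length_penalty)


def can_stop_never(best_running: Beam, worst_finished: Beam,
                   length_penalty: float, max_possible_len: int) -> bool:
    if length_penalty > 0.0:
        # Longer sequences score higher, so bound by the longest possible length.
        highest_attainable = best_running.score(length_penalty, max_possible_len)
    else:
        highest_attainable = best_running.score(length_penalty)
    return worst_finished.score(length_penalty) >= highest_attainable


print(can_stop_never(Beam(-12.0, 20), Beam(-10.0, 18),
                     length_penalty=1.0, max_possible_len=128))
```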
- parent.status = SequenceStatus.FINISHED_ABORTED - seq_group.remove(parent.seq_id) - self.scheduler.free_seq(parent) - continue - # Fork the parent sequence if there are multiple child samples. - for child_sample in child_samples[:-1]: - new_child_seq_id = next(self.seq_counter) - child = parent.fork(new_child_seq_id) - child.append_token_id(child_sample.output_token, - child_sample.logprobs) - child_seqs.append((child, parent)) - # Continue the parent sequence for the last child sample. - # We reuse the parent sequence here to reduce redundant memory - # copies, especially when using non-beam search sampling methods. - last_child_sample = child_samples[-1] - parent.append_token_id(last_child_sample.output_token, - last_child_sample.logprobs) - child_seqs.append((parent, parent)) - - for seq, _ in child_seqs: - self._decode_sequence(seq, seq_group.sampling_params) - self._check_stop(seq, seq_group.sampling_params) - - # Non-beam search case - if not seq_group.sampling_params.use_beam_search: - # For newly created child sequences, add them to the sequence group - # and fork them in block manager if they are not finished. - for seq, parent in child_seqs: - if seq is not parent: - seq_group.add(seq) - if not seq.is_finished(): - self.scheduler.fork_seq(parent, seq) - - # Free the finished and selected parent sequences' memory in block - # manager. Keep them in the sequence group as candidate output. - # NOTE: we need to fork the new sequences before freeing the - # old sequences. - for seq, parent in child_seqs: - if seq is parent and seq.is_finished(): - self.scheduler.free_seq(seq) - return - - # Beam search case - # Select the child sequences to keep in the sequence group. - selected_child_seqs = [] - unselected_child_seqs = [] - beam_width = seq_group.sampling_params.best_of - length_penalty = seq_group.sampling_params.length_penalty - - # Select the newly finished sequences with the highest scores - # to replace existing finished sequences. - # Tuple of (seq, parent, is_new) - existing_finished_seqs = [(seq, None, False) - for seq in existing_finished_seqs] - new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs - if seq.is_finished()] - all_finished_seqs = existing_finished_seqs + new_finished_seqs - # Sort the finished sequences by their scores. - all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id), - reverse=True) - for seq, parent, is_new in all_finished_seqs[:beam_width]: - if is_new: - # A newly generated child sequence finishes and has a high - # score, so we will add it into the sequence group. - selected_child_seqs.append((seq, parent)) - for seq, parent, is_new in all_finished_seqs[beam_width:]: - if is_new: - # A newly generated child sequence finishes but has a low - # score, so we will not add it into the sequence group. - # Additionally, if this sequence is a continuation of a - # parent sequence, we will need remove the parent sequence - # from the sequence group. - unselected_child_seqs.append((seq, parent)) - else: - # An existing finished sequence has a low score, so we will - # remove it from the sequence group. - seq_group.remove(seq.seq_id) - - # select the top beam_width sequences from the running - # sequences for the next iteration to continue the beam - # search. - running_child_seqs = [(seq, parent) for seq, parent in child_seqs - if not seq.is_finished()] - # Sort the running sequences by their scores. 
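The loop above forks the parent sequence once per extra sample and reuses the parent itself for the last sample to avoid a redundant copy. A toy version of that fork-and-append step (the dataclass and function names are illustrative):

```python
from copy import deepcopy
from dataclasses import dataclass, field
from typing import List, Tuple


@dataclass
class Seq:
    seq_id: int
    tokens: List[int] = field(default_factory=list)

    def fork(self, new_id: int) -> "Seq":
        child = deepcopy(self)
        child.seq_id = new_id
        return child


def expand(parent: Seq, sampled_tokens: List[int],
           next_id: int) -> List[Tuple[Seq, Seq]]:
    """Fork the parent for all but the last sample; reuse it for the last one."""
    pairs: List[Tuple[Seq, Seq]] = []
    for token in sampled_tokens[:-1]:
        child = parent.fork(next_id)
        next_id += 1
        child.tokens.append(token)
        pairs.append((child, parent))
    parent.tokens.append(sampled_tokens[-1])   # parent continues as the last child
    pairs.append((parent, parent))
    return pairs


print(expand(Seq(0, [1, 2]), sampled_tokens=[7, 8, 9], next_id=1))
```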
- running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id), - reverse=True) - - # Check if we can stop the beam search. - if len(running_child_seqs) == 0: - # No running sequences, stop the beam search. - stop_beam_search = True - elif len(all_finished_seqs) < beam_width: - # Not enough finished sequences, continue the beam search. - stop_beam_search = False - else: - # Check the early stopping criteria - best_running_seq = running_child_seqs[0][0] - current_worst_seq = all_finished_seqs[beam_width - 1][0] - stop_beam_search = self._check_beam_search_early_stopping( - seq_group.sampling_params.early_stopping, - seq_group.sampling_params, best_running_seq, current_worst_seq) - - if stop_beam_search: - # Stop the beam search and remove all the running sequences from - # the sequence group. - unselected_child_seqs.extend(running_child_seqs) - else: - # Continue the beam search and select the top beam_width sequences - # to continue the beam search. - selected_child_seqs.extend(running_child_seqs[:beam_width]) - # The remaining running sequences will not be used in the next - # iteration. Again, if these sequences are continuations of - # parent sequences, we will need to remove the parent sequences - # from the sequence group. - unselected_child_seqs.extend(running_child_seqs[beam_width:]) - - # For newly created child sequences, add them to the sequence group - # and fork them in block manager if they are not finished. - for seq, parent in selected_child_seqs: - if seq is not parent: - seq_group.add(seq) - if not seq.is_finished(): - self.scheduler.fork_seq(parent, seq) - - # Free the finished and selected parent sequences' memory in block - # manager. Keep them in the sequence group as candidate output. - for seq, parent in selected_child_seqs: - if seq is parent and seq.is_finished(): - self.scheduler.free_seq(seq) - - # Remove the unselected parent sequences from the sequence group and - # free their memory in block manager. - for seq, parent in unselected_child_seqs: - if seq is parent: - # Remove the parent sequence if it is not selected for next - # iteration - seq_group.remove(seq.seq_id) - self.scheduler.free_seq(seq) - - def _process_model_outputs( - self, output: SamplerOutput, - scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]: - # Update the scheduled sequence groups with the model outputs. - scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups - for seq_group, outputs in zip(scheduled_seq_groups, output): - self._process_sequence_group_outputs(seq_group, outputs) - - # Free the finished sequence groups. - self.scheduler.free_finished_seq_groups() - - # Create the outputs. - request_outputs: List[RequestOutput] = [] - for seq_group in scheduled_seq_groups: - request_output = RequestOutput.from_seq_group(seq_group) - request_outputs.append(request_output) - for seq_group in scheduler_outputs.ignored_seq_groups: - request_output = RequestOutput.from_seq_group(seq_group) - request_outputs.append(request_output) - - # Update prefix state, now all the uncomputed prefixes are computed. - for seq_group in scheduled_seq_groups: - if (seq_group.prefix is not None and seq_group.prefix.allocated - and not seq_group.prefix.computed): - seq_group.prefix.computed = True - - if self.log_stats: - # Log the system stats. 
- self._log_system_stats(scheduler_outputs.prompt_run, - scheduler_outputs.num_batched_tokens) - return request_outputs - - def step(self) -> List[RequestOutput]: - """Performs one decoding iteration and returns newly generated results. - - .. figure:: https://i.imgur.com/sv2HssD.png - :alt: Overview of the step function - :align: center - - Overview of the step function. - - Details: - - Step 1: Schedules the sequences to be executed in the next - iteration and the token blocks to be swapped in/out/copy. - - - Depending on the scheduling policy, - sequences may be `preempted/reordered`. - - A Sequence Group (SG) refer to a group of sequences - that are generated from the same prompt. - - - Step 2: Calls the workers to execute the model. - - Step 3: Processes the model output. This mainly includes: - - - Decodes the relevant outputs. - - Updates the scheduled sequence groups with model outputs - based on its `sampling parameters` (`use_beam_search` or not). - - Frees the finished sequence groups. - - - Finally, it creates and returns the newly generated results. - - Example: - >>> # Please see the example/ folder for more detailed examples. - >>> - >>> # initialize engine and request arguments - >>> engine = LLMEngine.from_engine_args(engine_args) - >>> example_inputs = [(0, "What is LLM?", - >>> SamplingParams(temperature=0.0))] - >>> - >>> # Start the engine with an event loop - >>> while True: - >>> if example_inputs: - >>> req_id, prompt, sampling_params = example_inputs.pop(0) - >>> engine.add_request(str(req_id), prompt, sampling_params) - >>> - >>> # continue the request processing - >>> request_outputs = engine.step() - >>> for request_output in request_outputs: - >>> if request_output.finished: - >>> # return or show the request output - >>> - >>> if not (engine.has_unfinished_requests() or example_inputs): - >>> break - """ - seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule() - - if not scheduler_outputs.is_empty(): - # Execute the model. - all_outputs = self._run_workers( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, - "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, - "blocks_to_copy": scheduler_outputs.blocks_to_copy, - }) - - # Only the driver worker returns the sampling results. - output = all_outputs[0] - else: - output = [] - - return self._process_model_outputs(output, scheduler_outputs) - - def do_log_stats(self) -> None: - self._log_system_stats(False, 0) - - def _log_system_stats( - self, - prompt_run: bool, - num_batched_tokens: int, - ) -> None: - now = time.monotonic() - # Log the number of batched input tokens. - if prompt_run: - self.num_prompt_tokens.append((now, num_batched_tokens)) - else: - self.num_generation_tokens.append((now, num_batched_tokens)) - - should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC - if not should_log: - return - - # Discard the old stats. 
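The logging code that follows keeps `(timestamp, num_tokens)` samples, prunes anything older than the logging interval, and averages the remainder (excluding the newest batch) over the window. A compact stdlib sketch of the same sliding-window throughput calculation:

```python
import time
from collections import deque

WINDOW_SEC = 5.0
samples = deque()                 # (timestamp, num_tokens), oldest first


def record(num_tokens: int, now: float) -> float:
    """Record a batch and return tokens/s over the last WINDOW_SEC seconds."""
    samples.append((now, num_tokens))
    while samples and now - samples[0][0] >= WINDOW_SEC:
        samples.popleft()
    if len(samples) < 2:
        return 0.0
    total = sum(n for _, n in list(samples)[:-1])   # exclude the newest batch,
    return total / (now - samples[0][0])            # as the engine code does


start = time.monotonic()
for second in range(4):
    print(round(record(256, start + second), 1))
```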
- self.num_prompt_tokens = [(t, n) for t, n in self.num_prompt_tokens - if now - t < _LOGGING_INTERVAL_SEC] - self.num_generation_tokens = [(t, n) - for t, n in self.num_generation_tokens - if now - t < _LOGGING_INTERVAL_SEC] - - if len(self.num_prompt_tokens) > 1: - total_num_tokens = sum(n for _, n in self.num_prompt_tokens[:-1]) - window = now - self.num_prompt_tokens[0][0] - avg_prompt_throughput = total_num_tokens / window - else: - avg_prompt_throughput = 0.0 - if len(self.num_generation_tokens) > 1: - total_num_tokens = sum(n - for _, n in self.num_generation_tokens[:-1]) - window = now - self.num_generation_tokens[0][0] - avg_generation_throughput = total_num_tokens / window - else: - avg_generation_throughput = 0.0 - - total_num_gpu_blocks = self.cache_config.num_gpu_blocks - num_free_gpu_blocks = ( - self.scheduler.block_manager.get_num_free_gpu_blocks()) - num_used_gpu_blocks = total_num_gpu_blocks - num_free_gpu_blocks - gpu_cache_usage = num_used_gpu_blocks / total_num_gpu_blocks - - total_num_cpu_blocks = self.cache_config.num_cpu_blocks - if total_num_cpu_blocks > 0: - num_free_cpu_blocks = ( - self.scheduler.block_manager.get_num_free_cpu_blocks()) - num_used_cpu_blocks = total_num_cpu_blocks - num_free_cpu_blocks - cpu_cache_usage = num_used_cpu_blocks / total_num_cpu_blocks - else: - cpu_cache_usage = 0.0 - - record_metrics( - avg_prompt_throughput=avg_prompt_throughput, - avg_generation_throughput=avg_generation_throughput, - scheduler_running=len(self.scheduler.running), - scheduler_swapped=len(self.scheduler.swapped), - scheduler_waiting=len(self.scheduler.waiting), - gpu_cache_usage=gpu_cache_usage, - cpu_cache_usage=cpu_cache_usage, - ) - - logger.info("Avg prompt throughput: " - f"{avg_prompt_throughput:.1f} tokens/s, " - "Avg generation throughput: " - f"{avg_generation_throughput:.1f} tokens/s, " - f"Running: {len(self.scheduler.running)} reqs, " - f"Swapped: {len(self.scheduler.swapped)} reqs, " - f"Pending: {len(self.scheduler.waiting)} reqs, " - f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, " - f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%") - self.last_logging_time = now - - def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: - """Decodes the new token for a sequence.""" - (new_tokens, new_output_text, prefix_offset, - read_offset) = detokenize_incrementally( - self.get_tokenizer_for_seq(seq), - all_input_ids=seq.get_token_ids(), - prev_tokens=seq.tokens, - prefix_offset=seq.prefix_offset, - read_offset=seq.read_offset, - skip_special_tokens=prms.skip_special_tokens, - spaces_between_special_tokens=prms.spaces_between_special_tokens, - ) - if seq.tokens is None: - seq.tokens = new_tokens - else: - seq.tokens.extend(new_tokens) - seq.prefix_offset = prefix_offset - seq.read_offset = read_offset - seq.output_text += new_output_text - - def _check_stop(self, seq: Sequence, - sampling_params: SamplingParams) -> None: - """Stop the finished sequences.""" - for stop_str in sampling_params.stop: - if seq.output_text.endswith(stop_str): - if not sampling_params.include_stop_str_in_output: - # Truncate the output text so that the stop string is - # not included in the output. - seq.output_text = seq.output_text[:-len(stop_str)] - seq.status = SequenceStatus.FINISHED_STOPPED - return - if seq.get_last_token_id() in sampling_params.stop_token_ids: - seq.status = SequenceStatus.FINISHED_STOPPED - return - - # Check if the sequence has reached max_model_len. 
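`_check_stop` above finishes a sequence when its text ends with any stop string, optionally trimming that string from the output. The matching logic in isolation, as a small illustrative helper:

```python
from typing import List, Optional, Tuple


def apply_stop_strings(output_text: str, stop: List[str],
                       include_stop: bool = False) -> Tuple[str, Optional[str]]:
    """Return (possibly truncated text, matched stop string or None)."""
    for stop_str in stop:
        if output_text.endswith(stop_str):
            if not include_stop:
                # Trim the stop string so it is not part of the returned output.
                output_text = output_text[:-len(stop_str)]
            return output_text, stop_str
    return output_text, None


print(apply_stop_strings("Answer: 42\n###", ["###", "</s>"]))
```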
- if seq.get_len() > self.scheduler_config.max_model_len: - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the sequence has reached max_tokens. - if seq.get_output_len() == sampling_params.max_tokens: - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the sequence has generated the EOS token. - if ((not sampling_params.ignore_eos) and seq.get_last_token_id() - == self.get_tokenizer_for_seq(seq).eos_token_id): - seq.status = SequenceStatus.FINISHED_STOPPED - return - - def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "add_lora", - lora_request=lora_request, - ) - - def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "remove_lora", - lora_id=lora_id, - ) - - def list_loras(self) -> List[int]: - return self._run_workers("list_loras") - - def _run_workers( - self, - method: str, - *args, - driver_args: Optional[List[Any]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - max_concurrent_workers: Optional[int] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - - if max_concurrent_workers: - raise NotImplementedError( - "max_concurrent_workers is not supported yet.") - - # Start the ray workers first. - ray_worker_outputs = [ - worker.execute_method.remote(method, *args, **kwargs) - for worker in self.workers - ] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - # Start the driver worker after all the ray workers. - driver_worker_output = getattr(self.driver_worker, - method)(*driver_args, **driver_kwargs) - - # Get the results of the ray workers. - if self.workers: - ray_worker_outputs = ray.get(ray_worker_outputs) - - return [driver_worker_output] + ray_worker_outputs diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py deleted file mode 100644 index c64071207f6a032cd8048a00a0835c95942b1403..0000000000000000000000000000000000000000 --- a/vllm/engine/metrics.py +++ /dev/null @@ -1,51 +0,0 @@ -from aioprometheus import Gauge - -# The begin-* and end* here are used by the documentation generator -# to extract the metrics definitions. - -# begin-metrics-definitions -gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s", - "Average prefill throughput in tokens/s.") -gauge_avg_generation_throughput = Gauge( - "vllm:avg_generation_throughput_toks_per_s", - "Average generation throughput in tokens/s.") - -gauge_scheduler_running = Gauge( - "vllm:num_requests_running", - "Number of requests that is currently running for inference.") -gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped", - "Number requests swapped to CPU.") -gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting", - "Number of requests waiting to be processed.") - -gauge_gpu_cache_usage = Gauge( - "vllm:gpu_cache_usage_perc", - "GPU KV-cache usage. 1 means 100 percent usage.") -gauge_cpu_cache_usage = Gauge( - "vllm:cpu_cache_usage_perc", - "CPU KV-cache usage. 
1 means 100 percent usage.") -# end-metrics-definitions - -labels = {} - - -def add_global_metrics_labels(**kwargs): - labels.update(kwargs) - - -def record_metrics( - avg_prompt_throughput: float, - avg_generation_throughput: float, - scheduler_running: int, - scheduler_swapped: int, - scheduler_waiting: int, - gpu_cache_usage: float, - cpu_cache_usage: float, -): - gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput) - gauge_avg_generation_throughput.set(labels, avg_generation_throughput) - gauge_scheduler_running.set(labels, scheduler_running) - gauge_scheduler_swapped.set(labels, scheduler_swapped) - gauge_scheduler_waiting.set(labels, scheduler_waiting) - gauge_gpu_cache_usage.set(labels, gpu_cache_usage) - gauge_cpu_cache_usage.set(labels, cpu_cache_usage) diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py deleted file mode 100644 index afbc33ed19a0c8b235b8a2a8d36b2fec5c407453..0000000000000000000000000000000000000000 --- a/vllm/engine/ray_utils.py +++ /dev/null @@ -1,123 +0,0 @@ -from typing import Optional, List, Tuple, TYPE_CHECKING - -from vllm.config import ParallelConfig -from vllm.logger import init_logger -from vllm.utils import is_hip, set_cuda_visible_devices, get_ip - -logger = init_logger(__name__) - -try: - import ray - - class RayWorkerVllm: - """Ray wrapper for vllm.worker.Worker, allowing Worker to be - lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES.""" - - def __init__(self, init_cached_hf_modules=False) -> None: - if init_cached_hf_modules: - from transformers.dynamic_module_utils import init_hf_modules - init_hf_modules() - self.worker = None - - def init_worker(self, worker_init_fn): - self.worker = worker_init_fn() - - def __getattr__(self, name): - return getattr(self.worker, name) - - def execute_method(self, method, *args, **kwargs): - executor = getattr(self, method) - return executor(*args, **kwargs) - - def get_node_ip(self) -> str: - return get_ip() - - def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: - node_id = ray.get_runtime_context().get_node_id() - gpu_ids = ray.get_gpu_ids() - return node_id, gpu_ids - - def set_cuda_visible_devices(self, device_ids) -> None: - set_cuda_visible_devices(device_ids) - -except ImportError as e: - logger.warning(f"Failed to import Ray with {e!r}. " - "For distributed inference, please install Ray with " - "`pip install ray`.") - ray = None - RayWorkerVllm = None - -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup - - -def initialize_cluster( - parallel_config: ParallelConfig, - engine_use_ray: bool = False, - ray_address: Optional[str] = None, -) -> Optional["PlacementGroup"]: - """Initialize the distributed cluster probably with Ray. - - Args: - parallel_config: The configurations for parallel execution. - engine_use_ray: Whether to use Ray for async engine. - ray_address: The address of the Ray cluster. If None, uses - the default Ray cluster address. - - Returns: - An optional `PlacementGroup`. It includes the specification - of the resources for each distributed worker. None if Ray is - not used. - """ - if parallel_config.worker_use_ray or engine_use_ray: - if ray is None: - raise ImportError( - "Ray is not installed. Please install Ray to use distributed " - "serving.") - # Connect to a ray cluster. 
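The code below either validates an existing placement group or creates one GPU bundle per worker and blocks until it is ready. In isolation, that Ray pattern looks roughly like the following sketch, which assumes Ray is installed and enough GPU resources are reachable:

```python
import ray
from ray.util.placement_group import placement_group

world_size = 4  # hypothetical tensor-parallel degree

ray.init(ignore_reinit_error=True)

# One bundle per worker, each reserving a single GPU.
pg = placement_group([{"GPU": 1}] * world_size)

# Block until all bundles can be scheduled, or raise on timeout.
ray.get(pg.ready(), timeout=1800)
```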
- if is_hip(): - ray.init(address=ray_address, - ignore_reinit_error=True, - num_gpus=parallel_config.world_size) - else: - ray.init(address=ray_address, ignore_reinit_error=True) - - if not parallel_config.worker_use_ray: - assert parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") - return None - - # Create placement group for worker processes - current_placement_group = ray.util.get_current_placement_group() - if current_placement_group: - # We are in a placement group - bundles = current_placement_group.bundle_specs - # Verify that we can use the placement group. - gpu_bundles = 0 - for bundle in bundles: - bundle_gpus = bundle.get("GPU", 0) - if bundle_gpus > 1: - raise ValueError( - "Placement group bundle cannot have more than 1 GPU.") - if bundle_gpus: - gpu_bundles += 1 - if parallel_config.world_size > gpu_bundles: - raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the placement group.") - else: - num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) - if parallel_config.world_size > num_gpus_in_cluster: - raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the cluster.") - # Create a new placement group - placement_group_specs = ([{"GPU": 1}] * parallel_config.world_size) - current_placement_group = ray.util.placement_group( - placement_group_specs) - # Wait until PG is ready - this will block until all - # requested resources are available, and will timeout - # if they cannot be provisioned. - ray.get(current_placement_group.ready(), timeout=1800) - - return current_placement_group diff --git a/vllm/entrypoints/__init__.py b/vllm/entrypoints/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py deleted file mode 100644 index f7b8d258fae4c4044b514974d62126c2f15c000f..0000000000000000000000000000000000000000 --- a/vllm/entrypoints/api_server.py +++ /dev/null @@ -1,99 +0,0 @@ -import argparse -import json -from typing import AsyncGenerator - -from fastapi import FastAPI, Request -from fastapi.responses import JSONResponse, Response, StreamingResponse -import uvicorn - -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams -from vllm.utils import random_uuid - -TIMEOUT_KEEP_ALIVE = 5 # seconds. -app = FastAPI() -engine = None - - -@app.get("/health") -async def health() -> Response: - """Health check.""" - return Response(status_code=200) - - -@app.post("/generate") -async def generate(request: Request) -> Response: - """Generate completion for the request. - - The request should be a JSON object with the following fields: - - prompt: the prompt to use for the generation. - - stream: whether to stream the results or not. - - other fields: the sampling parameters (See `SamplingParams` for details). 
- """ - request_dict = await request.json() - prompt = request_dict.pop("prompt") - prefix_pos = request_dict.pop("prefix_pos", None) - stream = request_dict.pop("stream", False) - sampling_params = SamplingParams(**request_dict) - request_id = random_uuid() - - results_generator = engine.generate(prompt, - sampling_params, - request_id, - prefix_pos=prefix_pos) - - # Streaming case - async def stream_results() -> AsyncGenerator[bytes, None]: - async for request_output in results_generator: - prompt = request_output.prompt - text_outputs = [ - prompt + output.text for output in request_output.outputs - ] - ret = {"text": text_outputs} - yield (json.dumps(ret) + "\0").encode("utf-8") - - if stream: - return StreamingResponse(stream_results()) - - # Non-streaming case - final_output = None - async for request_output in results_generator: - if await request.is_disconnected(): - # Abort the request if the client disconnects. - await engine.abort(request_id) - return Response(status_code=499) - final_output = request_output - - assert final_output is not None - prompt = final_output.prompt - text_outputs = [prompt + output.text for output in final_output.outputs] - ret = {"text": text_outputs} - return JSONResponse(ret) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--host", type=str, default=None) - parser.add_argument("--port", type=int, default=8000) - parser.add_argument("--ssl-keyfile", type=str, default=None) - parser.add_argument("--ssl-certfile", type=str, default=None) - parser.add_argument( - "--root-path", - type=str, - default=None, - help="FastAPI root_path when app is behind a path based routing proxy") - parser = AsyncEngineArgs.add_cli_args(parser) - args = parser.parse_args() - - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngine.from_engine_args(engine_args) - - app.root_path = args.root_path - uvicorn.run(app, - host=args.host, - port=args.port, - log_level="debug", - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, - ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py deleted file mode 100644 index 614e6fa520c8cee36ac7a65182785db5473c568f..0000000000000000000000000000000000000000 --- a/vllm/entrypoints/llm.py +++ /dev/null @@ -1,220 +0,0 @@ -from typing import List, Optional, Union - -from tqdm import tqdm -from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast - -from vllm.lora.request import LoRARequest -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine -from vllm.outputs import RequestOutput -from vllm.sampling_params import SamplingParams -from vllm.utils import Counter - - -class LLM: - """An LLM for generating texts from given prompts and sampling parameters. - - This class includes a tokenizer, a language model (possibly distributed - across multiple GPUs), and GPU memory space allocated for intermediate - states (aka KV cache). Given a batch of prompts and sampling parameters, - this class generates texts from the model, using an intelligent batching - mechanism and efficient memory management. - - NOTE: This class is intended to be used for offline inference. For online - serving, use the `AsyncLLMEngine` class instead. - NOTE: For the comprehensive list of arguments, see `EngineArgs`. - - Args: - model: The name or path of a HuggingFace Transformers model. - tokenizer: The name or path of a HuggingFace Transformers tokenizer. - tokenizer_mode: The tokenizer mode. 
"auto" will use the fast tokenizer - if available, and "slow" will always use the slow tokenizer. - trust_remote_code: Trust remote code (e.g., from HuggingFace) when - downloading the model and tokenizer. - tensor_parallel_size: The number of GPUs to use for distributed - execution with tensor parallelism. - dtype: The data type for the model weights and activations. Currently, - we support `float32`, `float16`, and `bfloat16`. If `auto`, we use - the `torch_dtype` attribute specified in the model config file. - However, if the `torch_dtype` in the config is `float32`, we will - use `float16` instead. - quantization: The method used to quantize the model weights. Currently, - we support "awq", "gptq" and "squeezellm". If None, we first check - the `quantization_config` attribute in the model config file. If - that is None, we assume the model weights are not quantized and use - `dtype` to determine the data type of the weights. - revision: The specific model version to use. It can be a branch name, - a tag name, or a commit id. - tokenizer_revision: The specific tokenizer version to use. It can be a - branch name, a tag name, or a commit id. - seed: The seed to initialize the random number generator for sampling. - gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to - reserve for the model weights, activations, and KV cache. Higher - values will increase the KV cache size and thus improve the model's - throughput. However, if the value is too high, it may cause out-of- - memory (OOM) errors. - swap_space: The size (GiB) of CPU memory per GPU to use as swap space. - This can be used for temporarily storing the states of the requests - when their `best_of` sampling parameters are larger than 1. If all - requests will have `best_of=1`, you can safely set this to 0. - Otherwise, too small values may cause out-of-memory (OOM) errors. - enforce_eager: Whether to enforce eager execution. If True, we will - disable CUDA graph and always execute the model in eager mode. - If False, we will use CUDA graph and eager execution in hybrid. - max_context_len_to_capture: Maximum context len covered by CUDA graphs. - When a sequence has context length larger than this, we fall back - to eager mode. 
- disable_custom_all_reduce: See ParallelConfig - """ - - def __init__( - self, - model: str, - tokenizer: Optional[str] = None, - tokenizer_mode: str = "auto", - trust_remote_code: bool = False, - tensor_parallel_size: int = 1, - dtype: str = "auto", - quantization: Optional[str] = None, - revision: Optional[str] = None, - tokenizer_revision: Optional[str] = None, - seed: int = 0, - gpu_memory_utilization: float = 0.9, - swap_space: int = 4, - enforce_eager: bool = False, - max_context_len_to_capture: int = 8192, - disable_custom_all_reduce: bool = False, - **kwargs, - ) -> None: - if "disable_log_stats" not in kwargs: - kwargs["disable_log_stats"] = True - engine_args = EngineArgs( - model=model, - tokenizer=tokenizer, - tokenizer_mode=tokenizer_mode, - trust_remote_code=trust_remote_code, - tensor_parallel_size=tensor_parallel_size, - dtype=dtype, - quantization=quantization, - revision=revision, - tokenizer_revision=tokenizer_revision, - seed=seed, - gpu_memory_utilization=gpu_memory_utilization, - swap_space=swap_space, - enforce_eager=enforce_eager, - max_context_len_to_capture=max_context_len_to_capture, - disable_custom_all_reduce=disable_custom_all_reduce, - **kwargs, - ) - self.llm_engine = LLMEngine.from_engine_args(engine_args) - self.request_counter = Counter() - - def get_tokenizer( - self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: - return self.llm_engine.tokenizer - - def set_tokenizer( - self, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - ) -> None: - self.llm_engine.tokenizer = tokenizer - - def generate( - self, - prompts: Optional[Union[str, List[str]]] = None, - sampling_params: Optional[SamplingParams] = None, - prompt_token_ids: Optional[List[List[int]]] = None, - prefix_pos: Optional[Union[int, List[int]]] = None, - use_tqdm: bool = True, - lora_request: Optional[LoRARequest] = None, - ) -> List[RequestOutput]: - """Generates the completions for the input prompts. - - NOTE: This class automatically batches the given prompts, considering - the memory constraint. For the best performance, put all of your prompts - into a single list and pass it to this method. - - Args: - prompts: A list of prompts to generate completions for. - sampling_params: The sampling parameters for text generation. If - None, we use the default sampling parameters. - prompt_token_ids: A list of token IDs for the prompts. If None, we - use the tokenizer to convert the prompts to token IDs. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. - use_tqdm: Whether to use tqdm to display the progress bar. - lora_request: LoRA request to use for generation, if any. - - Returns: - A list of `RequestOutput` objects containing the generated - completions in the same order as the input prompts. - """ - if prompts is None and prompt_token_ids is None: - raise ValueError("Either prompts or prompt_token_ids must be " - "provided.") - if isinstance(prompts, str): - # Convert a single prompt to a list. - prompts = [prompts] - if (prompts is not None and prompt_token_ids is not None - and len(prompts) != len(prompt_token_ids)): - raise ValueError("The lengths of prompts and prompt_token_ids " - "must be the same.") - if sampling_params is None: - # Use default sampling params. 
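Before the per-request loop below, it may help to see how this method is typically driven end to end. A minimal offline-inference sketch; the model name and sampling values are illustrative:

```python
from vllm import LLM, SamplingParams

# Illustrative model name; any supported HF causal LM works here.
llm = LLM(model="facebook/opt-125m")

prompts = ["Hello, my name is", "The capital of France is"]
params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

# Prompts are batched internally, so pass them all in a single call.
for out in llm.generate(prompts, params):
    print(out.prompt, "->", out.outputs[0].text)
```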
- sampling_params = SamplingParams() - - # Add requests to the engine. - num_requests = len(prompts) if prompts is not None else len( - prompt_token_ids) - for i in range(num_requests): - prompt = prompts[i] if prompts is not None else None - prefix_pos_i = prefix_pos[i] if prefix_pos is not None else None - token_ids = None if prompt_token_ids is None else prompt_token_ids[ - i] - self._add_request(prompt, - sampling_params, - token_ids, - lora_request=lora_request, - prefix_pos=prefix_pos_i) - return self._run_engine(use_tqdm) - - def _add_request( - self, - prompt: Optional[str], - sampling_params: SamplingParams, - prompt_token_ids: Optional[List[int]], - lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, - ) -> None: - request_id = str(next(self.request_counter)) - self.llm_engine.add_request(request_id, - prompt, - sampling_params, - prompt_token_ids, - lora_request=lora_request, - prefix_pos=prefix_pos) - - def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: - # Initialize tqdm. - if use_tqdm: - num_requests = self.llm_engine.get_num_unfinished_requests() - pbar = tqdm(total=num_requests, desc="Processed prompts") - # Run the engine. - outputs: List[RequestOutput] = [] - while self.llm_engine.has_unfinished_requests(): - step_outputs = self.llm_engine.step() - for output in step_outputs: - if output.finished: - outputs.append(output) - if use_tqdm: - pbar.update(1) - if use_tqdm: - pbar.close() - # Sort the outputs by request ID. - # This is necessary because some requests may be finished earlier than - # its previous requests. - outputs = sorted(outputs, key=lambda x: int(x.request_id)) - return outputs diff --git a/vllm/entrypoints/openai/__init__.py b/vllm/entrypoints/openai/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py deleted file mode 100644 index deb0fddd643cc59df9c9326c9cb1e19b04585b0a..0000000000000000000000000000000000000000 --- a/vllm/entrypoints/openai/api_server.py +++ /dev/null @@ -1,233 +0,0 @@ -import argparse -import asyncio -import json -from contextlib import asynccontextmanager -import os -import importlib -import inspect - -from aioprometheus import MetricsMiddleware -from aioprometheus.asgi.starlette import metrics -import fastapi -import uvicorn -from http import HTTPStatus -from fastapi import Request -from fastapi.exceptions import RequestValidationError -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, StreamingResponse, Response - -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.metrics import add_global_metrics_labels -from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse -from vllm.logger import init_logger -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion - -TIMEOUT_KEEP_ALIVE = 5 # seconds - -openai_serving_chat: OpenAIServingChat = None -openai_serving_completion: OpenAIServingCompletion = None -logger = init_logger(__name__) - - -@asynccontextmanager -async def lifespan(app: fastapi.FastAPI): - - async def _force_log(): - while True: - await asyncio.sleep(10) - await engine.do_log_stats() - - if not engine_args.disable_log_stats: - asyncio.create_task(_force_log()) - - yield - - 
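The lifespan hook above spawns a background task that periodically flushes engine stats. A stripped-down, self-contained version of the same pattern; the interval and the logging body are placeholders:

```python
import asyncio
from contextlib import asynccontextmanager

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    async def _tick():
        while True:
            await asyncio.sleep(10)
            print("stats heartbeat")  # stand-in for engine.do_log_stats()

    task = asyncio.create_task(_tick())
    yield
    task.cancel()  # unlike the original, this sketch also cancels on shutdown


app = FastAPI(lifespan=lifespan)
```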
-app = fastapi.FastAPI(lifespan=lifespan) - - -def parse_args(): - parser = argparse.ArgumentParser( - description="vLLM OpenAI-Compatible RESTful API server.") - parser.add_argument("--host", type=str, default=None, help="host name") - parser.add_argument("--port", type=int, default=8000, help="port number") - parser.add_argument("--allow-credentials", - action="store_true", - help="allow credentials") - parser.add_argument("--allowed-origins", - type=json.loads, - default=["*"], - help="allowed origins") - parser.add_argument("--allowed-methods", - type=json.loads, - default=["*"], - help="allowed methods") - parser.add_argument("--allowed-headers", - type=json.loads, - default=["*"], - help="allowed headers") - parser.add_argument( - "--api-key", - type=str, - default=None, - help= - "If provided, the server will require this key to be presented in the header." - ) - parser.add_argument("--served-model-name", - type=str, - default=None, - help="The model name used in the API. If not " - "specified, the model name will be the same as " - "the huggingface name.") - parser.add_argument("--chat-template", - type=str, - default=None, - help="The file path to the chat template, " - "or the template in single-line form " - "for the specified model") - parser.add_argument("--response-role", - type=str, - default="assistant", - help="The role name to return if " - "`request.add_generation_prompt=true`.") - parser.add_argument("--ssl-keyfile", - type=str, - default=None, - help="The file path to the SSL key file") - parser.add_argument("--ssl-certfile", - type=str, - default=None, - help="The file path to the SSL cert file") - parser.add_argument( - "--root-path", - type=str, - default=None, - help="FastAPI root_path when app is behind a path based routing proxy") - parser.add_argument( - "--middleware", - type=str, - action="append", - default=[], - help="Additional ASGI middleware to apply to the app. " - "We accept multiple --middleware arguments. " - "The value should be an import path. " - "If a function is provided, vLLM will add it to the server using @app.middleware('http'). " - "If a class is provided, vLLM will add it to the server using app.add_middleware(). 
" - ) - - parser = AsyncEngineArgs.add_cli_args(parser) - return parser.parse_args() - - -app.add_middleware(MetricsMiddleware) # Trace HTTP server metrics -app.add_route("/metrics", metrics) # Exposes HTTP metrics - - -@app.exception_handler(RequestValidationError) -async def validation_exception_handler(_, exc): - err = openai_serving_chat.create_error_response(message=str(exc)) - return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) - - -@app.get("/health") -async def health() -> Response: - """Health check.""" - return Response(status_code=200) - - -@app.get("/v1/models") -async def show_available_models(): - models = await openai_serving_chat.show_available_models() - return JSONResponse(content=models.model_dump()) - - -@app.post("/v1/chat/completions") -async def create_chat_completion(request: ChatCompletionRequest, - raw_request: Request): - generator = await openai_serving_chat.create_chat_completion( - request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - if request.stream: - return StreamingResponse(content=generator, - media_type="text/event-stream") - else: - return JSONResponse(content=generator.model_dump()) - - -@app.post("/v1/completions") -async def create_completion(request: CompletionRequest, raw_request: Request): - generator = await openai_serving_completion.create_completion( - request, raw_request) - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) - if request.stream: - return StreamingResponse(content=generator, - media_type="text/event-stream") - else: - return JSONResponse(content=generator.model_dump()) - - -if __name__ == "__main__": - args = parse_args() - - app.add_middleware( - CORSMiddleware, - allow_origins=args.allowed_origins, - allow_credentials=args.allow_credentials, - allow_methods=args.allowed_methods, - allow_headers=args.allowed_headers, - ) - - if token := os.environ.get("VLLM_API_KEY") or args.api_key: - - @app.middleware("http") - async def authentication(request: Request, call_next): - if not request.url.path.startswith("/v1"): - return await call_next(request) - if request.headers.get("Authorization") != "Bearer " + token: - return JSONResponse(content={"error": "Unauthorized"}, - status_code=401) - return await call_next(request) - - for middleware in args.middleware: - module_path, object_name = middleware.rsplit(".", 1) - imported = getattr(importlib.import_module(module_path), object_name) - if inspect.isclass(imported): - app.add_middleware(imported) - elif inspect.iscoroutinefunction(imported): - app.middleware("http")(imported) - else: - raise ValueError( - f"Invalid middleware {middleware}. Must be a function or a class." 
- ) - - logger.info(f"args: {args}") - - if args.served_model_name is not None: - served_model = args.served_model_name - else: - served_model = args.model - - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngine.from_engine_args(engine_args) - openai_serving_chat = OpenAIServingChat(engine, served_model, - args.response_role, - args.chat_template) - openai_serving_completion = OpenAIServingCompletion(engine, served_model) - - # Register labels for metrics - add_global_metrics_labels(model_name=engine_args.model) - - app.root_path = args.root_path - uvicorn.run(app, - host=args.host, - port=args.port, - log_level="info", - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, - ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py deleted file mode 100644 index fc15b7833ecf24ab5a44dbeb1ea7d4736f284768..0000000000000000000000000000000000000000 --- a/vllm/entrypoints/openai/protocol.py +++ /dev/null @@ -1,240 +0,0 @@ -# Adapted from -# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py -import time -from typing import Dict, List, Literal, Optional, Union - -from pydantic import BaseModel, Field - -from vllm.utils import random_uuid -from vllm.sampling_params import SamplingParams - - -class ErrorResponse(BaseModel): - object: str = "error" - message: str - type: str - param: Optional[str] = None - code: int - - -class ModelPermission(BaseModel): - id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}") - object: str = "model_permission" - created: int = Field(default_factory=lambda: int(time.time())) - allow_create_engine: bool = False - allow_sampling: bool = True - allow_logprobs: bool = True - allow_search_indices: bool = False - allow_view: bool = True - allow_fine_tuning: bool = False - organization: str = "*" - group: Optional[str] = None - is_blocking: str = False - - -class ModelCard(BaseModel): - id: str - object: str = "model" - created: int = Field(default_factory=lambda: int(time.time())) - owned_by: str = "vllm" - root: Optional[str] = None - parent: Optional[str] = None - permission: List[ModelPermission] = Field(default_factory=list) - - -class ModelList(BaseModel): - object: str = "list" - data: List[ModelCard] = Field(default_factory=list) - - -class UsageInfo(BaseModel): - prompt_tokens: int = 0 - total_tokens: int = 0 - completion_tokens: Optional[int] = 0 - - -class ChatCompletionRequest(BaseModel): - model: str - messages: Union[str, List[Dict[str, str]]] - temperature: Optional[float] = 0.7 - top_p: Optional[float] = 1.0 - n: Optional[int] = 1 - max_tokens: Optional[int] = None - stop: Optional[Union[str, List[str]]] = Field(default_factory=list) - stream: Optional[bool] = False - presence_penalty: Optional[float] = 0.0 - frequency_penalty: Optional[float] = 0.0 - logit_bias: Optional[Dict[str, float]] = None - user: Optional[str] = None - # Additional parameters supported by vLLM - best_of: Optional[int] = None - top_k: Optional[int] = -1 - ignore_eos: Optional[bool] = False - use_beam_search: Optional[bool] = False - stop_token_ids: Optional[List[int]] = Field(default_factory=list) - skip_special_tokens: Optional[bool] = True - spaces_between_special_tokens: Optional[bool] = True - add_generation_prompt: Optional[bool] = True - echo: Optional[bool] = False - repetition_penalty: Optional[float] = 1.0 - min_p: Optional[float] = 0.0 - include_stop_str_in_output: Optional[bool] = False 
- length_penalty: Optional[float] = 1.0 - - def to_sampling_params(self) -> SamplingParams: - return SamplingParams( - n=self.n, - presence_penalty=self.presence_penalty, - frequency_penalty=self.frequency_penalty, - repetition_penalty=self.repetition_penalty, - temperature=self.temperature, - top_p=self.top_p, - min_p=self.min_p, - stop=self.stop, - stop_token_ids=self.stop_token_ids, - max_tokens=self.max_tokens, - best_of=self.best_of, - top_k=self.top_k, - ignore_eos=self.ignore_eos, - use_beam_search=self.use_beam_search, - skip_special_tokens=self.skip_special_tokens, - spaces_between_special_tokens=self.spaces_between_special_tokens, - include_stop_str_in_output=self.include_stop_str_in_output, - length_penalty=self.length_penalty, - ) - - -class CompletionRequest(BaseModel): - model: str - # a string, array of strings, array of tokens, or array of token arrays - prompt: Union[List[int], List[List[int]], str, List[str]] - suffix: Optional[str] = None - max_tokens: Optional[int] = 16 - temperature: Optional[float] = 1.0 - top_p: Optional[float] = 1.0 - n: Optional[int] = 1 - stream: Optional[bool] = False - logprobs: Optional[int] = None - echo: Optional[bool] = False - stop: Optional[Union[str, List[str]]] = Field(default_factory=list) - presence_penalty: Optional[float] = 0.0 - frequency_penalty: Optional[float] = 0.0 - best_of: Optional[int] = None - logit_bias: Optional[Dict[str, float]] = None - user: Optional[str] = None - # Additional parameters supported by vLLM - top_k: Optional[int] = -1 - ignore_eos: Optional[bool] = False - use_beam_search: Optional[bool] = False - stop_token_ids: Optional[List[int]] = Field(default_factory=list) - skip_special_tokens: Optional[bool] = True - spaces_between_special_tokens: Optional[bool] = True - repetition_penalty: Optional[float] = 1.0 - min_p: Optional[float] = 0.0 - include_stop_str_in_output: Optional[bool] = False - length_penalty: Optional[float] = 1.0 - - def to_sampling_params(self): - echo_without_generation = self.echo and self.max_tokens == 0 - - return SamplingParams( - n=self.n, - best_of=self.best_of, - presence_penalty=self.presence_penalty, - frequency_penalty=self.frequency_penalty, - repetition_penalty=self.repetition_penalty, - temperature=self.temperature, - top_p=self.top_p, - top_k=self.top_k, - min_p=self.min_p, - stop=self.stop, - stop_token_ids=self.stop_token_ids, - ignore_eos=self.ignore_eos, - max_tokens=self.max_tokens if not echo_without_generation else 1, - logprobs=self.logprobs, - use_beam_search=self.use_beam_search, - prompt_logprobs=self.logprobs if self.echo else None, - skip_special_tokens=self.skip_special_tokens, - spaces_between_special_tokens=(self.spaces_between_special_tokens), - include_stop_str_in_output=self.include_stop_str_in_output, - length_penalty=self.length_penalty, - ) - - -class LogProbs(BaseModel): - text_offset: List[int] = Field(default_factory=list) - token_logprobs: List[Optional[float]] = Field(default_factory=list) - tokens: List[str] = Field(default_factory=list) - top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None - - -class CompletionResponseChoice(BaseModel): - index: int - text: str - logprobs: Optional[LogProbs] = None - finish_reason: Optional[Literal["stop", "length"]] = None - - -class CompletionResponse(BaseModel): - id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}") - object: str = "text_completion" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - choices: List[CompletionResponseChoice] - usage: UsageInfo - 
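The streaming models defined next are serialized once per emitted token with `model_dump_json(exclude_unset=True)` and framed as server-sent events. A minimal pydantic v2 sketch of that framing, with the field set trimmed down for illustration:

```python
import time
from typing import List, Optional

from pydantic import BaseModel


class StreamChoice(BaseModel):
    index: int
    text: str
    finish_reason: Optional[str] = None


class StreamChunk(BaseModel):
    id: str
    object: str = "text_completion"
    created: int
    model: str
    choices: List[StreamChoice]


chunk = StreamChunk(id="cmpl-123", created=int(time.time()),
                    model="demo-model",
                    choices=[StreamChoice(index=0, text="Hel")])

# exclude_unset drops fields that were not explicitly passed at construction
# (here: `object` and `finish_reason`), keeping each SSE payload small.
sse_line = f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
print(sse_line)
```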
- -class CompletionResponseStreamChoice(BaseModel): - index: int - text: str - logprobs: Optional[LogProbs] = None - finish_reason: Optional[Literal["stop", "length"]] = None - - -class CompletionStreamResponse(BaseModel): - id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}") - object: str = "text_completion" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - choices: List[CompletionResponseStreamChoice] - usage: Optional[UsageInfo] = Field(default=None) - - -class ChatMessage(BaseModel): - role: str - content: str - - -class ChatCompletionResponseChoice(BaseModel): - index: int - message: ChatMessage - finish_reason: Optional[Literal["stop", "length"]] = None - - -class ChatCompletionResponse(BaseModel): - id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") - object: str = "chat.completion" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - choices: List[ChatCompletionResponseChoice] - usage: UsageInfo - - -class DeltaMessage(BaseModel): - role: Optional[str] = None - content: Optional[str] = None - - -class ChatCompletionResponseStreamChoice(BaseModel): - index: int - delta: DeltaMessage - finish_reason: Optional[Literal["stop", "length"]] = None - - -class ChatCompletionStreamResponse(BaseModel): - id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") - object: str = "chat.completion.chunk" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - choices: List[ChatCompletionResponseStreamChoice] - usage: Optional[UsageInfo] = Field(default=None) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py deleted file mode 100644 index a9e4c355560b8ae056dc3329dd47e37b95abd532..0000000000000000000000000000000000000000 --- a/vllm/entrypoints/openai/serving_chat.py +++ /dev/null @@ -1,265 +0,0 @@ -import time -import codecs -from fastapi import Request -from typing import AsyncGenerator, AsyncIterator, Union -from vllm.logger import init_logger -from vllm.utils import random_uuid -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.protocol import ( - ChatCompletionRequest, ChatCompletionResponse, - ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, - ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse, - UsageInfo) -from vllm.outputs import RequestOutput -from vllm.entrypoints.openai.serving_engine import OpenAIServing - -logger = init_logger(__name__) - - -class OpenAIServingChat(OpenAIServing): - - def __init__(self, - engine: AsyncLLMEngine, - served_model: str, - response_role: str, - chat_template=None): - super().__init__(engine=engine, served_model=served_model) - self.response_role = response_role - self._load_chat_template(chat_template) - - async def create_chat_completion( - self, request: ChatCompletionRequest, raw_request: Request - ) -> Union[ErrorResponse, AsyncGenerator[str, None], - ChatCompletionResponse]: - """Completion API similar to OpenAI's API. - - See https://platform.openai.com/docs/api-reference/chat/create - for the API specification. This API mimics the OpenAI ChatCompletion API. 
- - NOTE: Currently we do not support the following features: - - function_call (Users should implement this by themselves) - - logit_bias (to be supported by vLLM engine) - """ - error_check_ret = await self._check_model(request) - if error_check_ret is not None: - return error_check_ret - - if request.logit_bias is not None and len(request.logit_bias) > 0: - # TODO: support logit_bias in vLLM engine. - return self.create_error_response( - "logit_bias is not currently supported") - - try: - prompt = self.tokenizer.apply_chat_template( - conversation=request.messages, - tokenize=False, - add_generation_prompt=request.add_generation_prompt) - except Exception as e: - logger.error( - f"Error in applying chat template from request: {str(e)}") - return self.create_error_response(str(e)) - - request_id = f"cmpl-{random_uuid()}" - try: - token_ids = self._validate_prompt_and_tokenize(request, - prompt=prompt) - sampling_params = request.to_sampling_params() - except ValueError as e: - return self.create_error_response(str(e)) - - result_generator = self.engine.generate(prompt, sampling_params, - request_id, token_ids) - # Streaming response - if request.stream: - return self.chat_completion_stream_generator( - request, result_generator, request_id) - else: - return await self.chat_completion_full_generator( - request, raw_request, result_generator, request_id) - - def get_chat_request_role(self, request: ChatCompletionRequest) -> str: - if request.add_generation_prompt: - return self.response_role - else: - return request.messages[-1].role - - async def chat_completion_stream_generator( - self, request: ChatCompletionRequest, - result_generator: AsyncIterator[RequestOutput], request_id: str - ) -> Union[ErrorResponse, AsyncGenerator[str, None]]: - - model_name = request.model - created_time = int(time.monotonic()) - chunk_object_type = "chat.completion.chunk" - - # Send first response for each request.n (index) with the role - role = self.get_chat_request_role(request) - for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, delta=DeltaMessage(role=role), finish_reason=None) - chunk = ChatCompletionStreamResponse(id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" - - # Send response to echo the input portion of the last message - if request.echo: - last_msg_content = "" - if request.messages and isinstance( - request.messages, list) and request.messages[-1].get( - "content") and request.messages[-1].get( - "role") == role: - last_msg_content = request.messages[-1]["content"] - if last_msg_content: - for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(content=last_msg_content), - finish_reason=None) - chunk = ChatCompletionStreamResponse( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" - - # Send response for each token for each request.n (index) - previous_texts = [""] * request.n - previous_num_tokens = [0] * request.n - finish_reason_sent = [False] * request.n - async for res in result_generator: - res: RequestOutput - for output in res.outputs: - i = output.index - - if finish_reason_sent[i]: - continue - - delta_text = output.text[len(previous_texts[i]):] - previous_texts[i] = output.text - previous_num_tokens[i] = 
len(output.token_ids) - - if output.finish_reason is None: - # Send token-by-token response for each request.n - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(content=delta_text), - finish_reason=None) - chunk = ChatCompletionStreamResponse( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" - else: - # Send the finish response for each request.n only once - prompt_tokens = len(res.prompt_token_ids) - final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=previous_num_tokens[i], - total_tokens=prompt_tokens + previous_num_tokens[i], - ) - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(content=delta_text), - finish_reason=output.finish_reason) - chunk = ChatCompletionStreamResponse( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) - if final_usage is not None: - chunk.usage = final_usage - data = chunk.model_dump_json(exclude_unset=True, - exclude_none=True) - yield f"data: {data}\n\n" - finish_reason_sent[i] = True - # Send the final done message after all response.n are finished - yield "data: [DONE]\n\n" - - async def chat_completion_full_generator( - self, request: ChatCompletionRequest, raw_request: Request, - result_generator: AsyncIterator[RequestOutput], - request_id: str) -> Union[ErrorResponse, ChatCompletionResponse]: - - model_name = request.model - created_time = int(time.monotonic()) - final_res: RequestOutput = None - - async for res in result_generator: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. 
- await self.engine.abort(request_id) - return self.create_error_response("Client disconnected") - final_res = res - assert final_res is not None - - choices = [] - role = self.get_chat_request_role(request) - for output in final_res.outputs: - choice_data = ChatCompletionResponseChoice( - index=output.index, - message=ChatMessage(role=role, content=output.text), - finish_reason=output.finish_reason, - ) - choices.append(choice_data) - - if request.echo: - last_msg_content = "" - if request.messages and isinstance( - request.messages, list) and request.messages[-1].get( - "content") and request.messages[-1].get( - "role") == role: - last_msg_content = request.messages[-1]["content"] - - for choice in choices: - full_message = last_msg_content + choice.message.content - choice.message.content = full_message - - num_prompt_tokens = len(final_res.prompt_token_ids) - num_generated_tokens = sum( - len(output.token_ids) for output in final_res.outputs) - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=num_generated_tokens, - total_tokens=num_prompt_tokens + num_generated_tokens, - ) - response = ChatCompletionResponse( - id=request_id, - created=created_time, - model=model_name, - choices=choices, - usage=usage, - ) - - return response - - def _load_chat_template(self, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - self.tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - self.tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logger.info( - f"Using supplied chat template:\n{self.tokenizer.chat_template}" - ) - elif self.tokenizer.chat_template is not None: - logger.info( - f"Using default chat template:\n{self.tokenizer.chat_template}" - ) - else: - logger.warning( - "No chat template provided. 
Chat API will not work.") diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py deleted file mode 100644 index 8c9a7ad309cea3b6362a8ff9dbfdd49859989469..0000000000000000000000000000000000000000 --- a/vllm/entrypoints/openai/serving_completion.py +++ /dev/null @@ -1,349 +0,0 @@ -import asyncio -import time -from fastapi import Request -from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional -from vllm.logger import init_logger -from vllm.utils import random_uuid -from vllm.engine.async_llm_engine import AsyncLLMEngine -from .protocol import ( - CompletionRequest, - CompletionResponse, - CompletionResponseChoice, - CompletionResponseStreamChoice, - CompletionStreamResponse, - LogProbs, - UsageInfo, -) -from vllm.outputs import RequestOutput -from vllm.entrypoints.openai.serving_engine import OpenAIServing - -logger = init_logger(__name__) - -TypeTokenIDs = list[int] -TypeTopLogProbs = List[Optional[dict[int, float]]] -TypeCreateLogProbsFn = Callable[ - [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], LogProbs] - - -async def completion_stream_generator( - request: CompletionRequest, - raw_request: Request, - on_abort, - result_generator: AsyncIterator[tuple[int, RequestOutput]], - create_logprobs_fn: TypeCreateLogProbsFn, - request_id: str, - created_time: int, - model_name: str, - num_prompts: int, -) -> AsyncGenerator[str, None]: - previous_texts = [""] * request.n * num_prompts - previous_num_tokens = [0] * request.n * num_prompts - has_echoed = [False] * request.n * num_prompts - - async for prompt_idx, res in result_generator: - - # Abort the request if the client disconnects. - if await raw_request.is_disconnected(): - await on_abort(f"{request_id}-{prompt_idx}") - raise StopAsyncIteration() - - for output in res.outputs: - i = output.index + prompt_idx * request.n - # TODO(simon): optimize the performance by avoiding full text O(n^2) sending. 
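The branches that follow decide between echoing the prompt and sending only the newly generated suffix; the core delta bookkeeping reduces to slicing off what was already sent. A tiny sketch of that idea:

```python
# Each engine step yields the *cumulative* text for a sequence; the stream
# should emit only the suffix that has not been sent before.
previous_text = ""

for cumulative_text in ["He", "Hell", "Hello,", "Hello, wor", "Hello, world"]:
    delta = cumulative_text[len(previous_text):]
    previous_text = cumulative_text
    print(repr(delta))  # 'He', 'll', 'o,', ' wor', 'ld'
```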
- - if request.echo and request.max_tokens == 0: - # only return the prompt - delta_text = res.prompt - delta_token_ids = res.prompt_token_ids - top_logprobs = res.prompt_logprobs - has_echoed[i] = True - elif request.echo and request.max_tokens > 0 and not has_echoed[i]: - # echo the prompt and first token - delta_text = res.prompt + output.text - delta_token_ids = res.prompt_token_ids + output.token_ids - top_logprobs = res.prompt_logprobs + (output.logprobs or []) - has_echoed[i] = True - else: - # return just the delta - delta_text = output.text[len(previous_texts[i]):] - delta_token_ids = output.token_ids[previous_num_tokens[i]:] - top_logprobs = output.logprobs[ - previous_num_tokens[i]:] if output.logprobs else None - - if request.logprobs is not None: - assert top_logprobs is not None, "top_logprobs must be provided when logprobs is requested" - logprobs = create_logprobs_fn( - token_ids=delta_token_ids, - top_logprobs=top_logprobs, - num_output_top_logprobs=request.logprobs, - initial_text_offset=len(previous_texts[i]), - ) - else: - logprobs = None - - previous_texts[i] = output.text - previous_num_tokens[i] = len(output.token_ids) - finish_reason = output.finish_reason - response_json = CompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[ - CompletionResponseStreamChoice( - index=i, - text=delta_text, - logprobs=logprobs, - finish_reason=finish_reason, - ) - ]).model_dump_json(exclude_unset=True) - yield f"data: {response_json}\n\n" - - if output.finish_reason is not None: # return final usage - logprobs = LogProbs() if request.logprobs is not None else None - prompt_tokens = len(res.prompt_token_ids) - completion_tokens = len(output.token_ids) - final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ) - response_json = CompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[ - CompletionResponseStreamChoice( - index=i, - text="", - logprobs=logprobs, - finish_reason=output.finish_reason, - ) - ], - usage=final_usage, - ).model_dump_json(exclude_unset=True) - yield f"data: {response_json}\n\n" - - yield "data: [DONE]\n\n" - - -def parse_prompt_format(prompt) -> tuple[bool, list]: - # get the prompt, openai supports the following - # "a string, array of strings, array of tokens, or array of token arrays." 
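For reference, the four accepted shapes and how the normalization below classifies them; the token ids are purely illustrative:

```python
# The /v1/completions endpoint accepts any of these shapes for "prompt";
# all of them are normalized to (prompt_is_tokens, prompts) by the logic below.
examples = {
    "single string":       "Hello",
    "list of strings":     ["Hello", "Bonjour"],
    "single token list":   [15496, 11],
    "list of token lists": [[15496, 11], [33, 42]],
}
for name, prompt in examples.items():
    is_tokens = (isinstance(prompt, list) and len(prompt) > 0
                 and not isinstance(prompt[0], str))
    print(f"{name:20s} -> prompt_is_tokens={is_tokens}")
```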
- prompt_is_tokens = False - prompts = [prompt] # case 1: a string - if isinstance(prompt, list): - if len(prompt) == 0: - raise ValueError("please provide at least one prompt") - elif isinstance(prompt[0], str): - prompt_is_tokens = False - prompts = prompt # case 2: array of strings - elif isinstance(prompt[0], int): - prompt_is_tokens = True - prompts = [prompt] # case 3: array of tokens - elif isinstance(prompt[0], list) and isinstance(prompt[0][0], int): - prompt_is_tokens = True - prompts = prompt # case 4: array of token arrays - else: - raise ValueError( - "prompt must be a string, array of strings, array of tokens, or array of token arrays" - ) - return prompt_is_tokens, prompts - - -def request_output_to_completion_response( - final_res_batch: list[RequestOutput], - request: CompletionRequest, - create_logprobs_fn: TypeCreateLogProbsFn, - request_id: str, - created_time: int, - model_name: str, -) -> CompletionResponse: - choices = [] - num_prompt_tokens = 0 - num_generated_tokens = 0 - for final_res in final_res_batch: - assert final_res is not None - prompt_token_ids = final_res.prompt_token_ids - prompt_logprobs = final_res.prompt_logprobs - prompt_text = final_res.prompt - - for output in final_res.outputs: - if request.echo and request.max_tokens == 0: - token_ids = prompt_token_ids - top_logprobs = prompt_logprobs - output_text = prompt_text - elif request.echo and request.max_tokens > 0: - token_ids = prompt_token_ids + output.token_ids - top_logprobs = prompt_logprobs + output.logprobs - output_text = prompt_text + output.text - else: - token_ids = output.token_ids - top_logprobs = output.logprobs - output_text = output.text - - if request.logprobs is not None: - logprobs = create_logprobs_fn( - token_ids=token_ids, - top_logprobs=top_logprobs, - num_output_top_logprobs=request.logprobs, - ) - else: - logprobs = None - - choice_data = CompletionResponseChoice( - index=len(choices), - text=output_text, - logprobs=logprobs, - finish_reason=output.finish_reason, - ) - choices.append(choice_data) - - num_prompt_tokens += len(prompt_token_ids) - num_generated_tokens += sum( - len(output.token_ids) for output in final_res.outputs) - - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=num_generated_tokens, - total_tokens=num_prompt_tokens + num_generated_tokens, - ) - - return CompletionResponse( - id=request_id, - created=created_time, - model=model_name, - choices=choices, - usage=usage, - ) - - -def merge_async_iterators(*iterators): - """Merge multiple asynchronous iterators into a single iterator. - - This method handle the case where some iterators finish before others. - When it yields, it yields a tuple (i, item) where i is the index of the - iterator that yields the item. - """ - queue = asyncio.Queue() - - finished = [False] * len(iterators) - - async def producer(i, iterator): - async for item in iterator: - await queue.put((i, item)) - finished[i] = True - - _tasks = [ - asyncio.create_task(producer(i, iterator)) - for i, iterator in enumerate(iterators) - ] - - async def consumer(): - while not all(finished) or not queue.empty(): - item = await queue.get() - yield item - await asyncio.gather(*_tasks) - - return consumer() - - -class OpenAIServingCompletion(OpenAIServing): - - def __init__(self, engine: AsyncLLMEngine, served_model: str): - super().__init__(engine=engine, served_model=served_model) - - async def create_completion(self, request: CompletionRequest, - raw_request: Request): - """Completion API similar to OpenAI's API. 
- - See https://platform.openai.com/docs/api-reference/completions/create - for the API specification. This API mimics the OpenAI Completion API. - - NOTE: Currently we do not support the following features: - - suffix (the language models we currently support do not support - suffix) - - logit_bias (to be supported by vLLM engine) - """ - error_check_ret = await self._check_model(request) - if error_check_ret is not None: - return error_check_ret - - # Return error for unsupported features. - if request.suffix is not None: - return self.create_error_response( - "suffix is not currently supported") - if request.logit_bias is not None and len(request.logit_bias) > 0: - return self.create_error_response( - "logit_bias is not currently supported") - - model_name = request.model - request_id = f"cmpl-{random_uuid()}" - created_time = int(time.monotonic()) - - # Schedule the request and get the result generator. - generators = [] - try: - sampling_params = request.to_sampling_params() - prompt_is_tokens, prompts = parse_prompt_format(request.prompt) - - for i, prompt in enumerate(prompts): - if prompt_is_tokens: - input_ids = self._validate_prompt_and_tokenize( - request, prompt_ids=prompt) - else: - input_ids = self._validate_prompt_and_tokenize( - request, prompt=prompt) - - generators.append( - self.engine.generate(None, - sampling_params, - f"{request_id}-{i}", - prompt_token_ids=input_ids)) - except ValueError as e: - return self.create_error_response(str(e)) - - result_generator: AsyncIterator[tuple[ - int, RequestOutput]] = merge_async_iterators(*generators) - - # Similar to the OpenAI API, when n != best_of, we do not stream the - # results. In addition, we do not stream the results when use beam search. - stream = (request.stream - and (request.best_of is None or request.n == request.best_of) - and not request.use_beam_search) - - # Streaming response - if stream: - return completion_stream_generator(request, - raw_request, - self.engine.abort, - result_generator, - self._create_logprobs, - request_id, - created_time, - model_name, - num_prompts=len(prompts)) - - # Non-streaming response - final_res_batch: RequestOutput = [None] * len(prompts) - async for i, res in result_generator: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. - await self.engine.abort(f"{request_id}-{i}") - return self.create_error_response("Client disconnected") - final_res_batch[i] = res - response = request_output_to_completion_response( - final_res_batch, request, self._create_logprobs, request_id, - created_time, model_name) - - # When user requests streaming but we don't stream, we still need to - # return a streaming response with a single event. 
- if request.stream: - response_json = response.model_dump_json() - - async def fake_stream_generator() -> AsyncGenerator[str, None]: - yield f"data: {response_json}\n\n" - yield "data: [DONE]\n\n" - - return fake_stream_generator() - - return response diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py deleted file mode 100644 index 390f9aeb89217efeda0c3938842dce233315ea0f..0000000000000000000000000000000000000000 --- a/vllm/entrypoints/openai/serving_engine.py +++ /dev/null @@ -1,133 +0,0 @@ -import asyncio -from http import HTTPStatus -from typing import Dict, List, Optional, Union -from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.protocol import (CompletionRequest, - ChatCompletionRequest, - ErrorResponse, LogProbs, - ModelCard, ModelList, - ModelPermission) - -logger = init_logger(__name__) - - -class OpenAIServing: - - def __init__(self, engine: AsyncLLMEngine, served_model: str): - self.engine = engine - self.served_model = served_model - - self.max_model_len = 0 - self.tokenizer = None - - try: - event_loop = asyncio.get_running_loop() - except RuntimeError: - event_loop = None - - if event_loop is not None and event_loop.is_running( - ): # If the current is instanced by Ray Serve, there is already a running event loop - event_loop.create_task(self._post_init()) - else: # When using single vLLM without engine_use_ray - asyncio.run(self._post_init()) - - async def _post_init(self): - engine_model_config = await self.engine.get_model_config() - self.max_model_len = engine_model_config.max_model_len - - # A separate tokenizer to map token IDs to strings. - self.tokenizer = get_tokenizer( - engine_model_config.tokenizer, - tokenizer_mode=engine_model_config.tokenizer_mode, - trust_remote_code=engine_model_config.trust_remote_code) - - async def show_available_models(self) -> ModelList: - """Show available models. 
Right now we only have one model.""" - model_cards = [ - ModelCard(id=self.served_model, - root=self.served_model, - permission=[ModelPermission()]) - ] - return ModelList(data=model_cards) - - def _create_logprobs( - self, - token_ids: List[int], - top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None, - num_output_top_logprobs: Optional[int] = None, - initial_text_offset: int = 0, - ) -> LogProbs: - """Create OpenAI-style logprobs.""" - logprobs = LogProbs() - last_token_len = 0 - if num_output_top_logprobs: - logprobs.top_logprobs = [] - for i, token_id in enumerate(token_ids): - step_top_logprobs = top_logprobs[i] - if step_top_logprobs is not None: - token_logprob = step_top_logprobs[token_id] - else: - token_logprob = None - token = self.tokenizer.convert_ids_to_tokens(token_id) - logprobs.tokens.append(token) - logprobs.token_logprobs.append(token_logprob) - if len(logprobs.text_offset) == 0: - logprobs.text_offset.append(initial_text_offset) - else: - logprobs.text_offset.append(logprobs.text_offset[-1] + - last_token_len) - last_token_len = len(token) - - if num_output_top_logprobs: - logprobs.top_logprobs.append({ - self.tokenizer.convert_ids_to_tokens(i): p - for i, p in step_top_logprobs.items() - } if step_top_logprobs else None) - return logprobs - - def create_error_response( - self, - message: str, - err_type: str = "BadRequestError", - status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: - return ErrorResponse(message=message, - type=err_type, - code=status_code.value) - - async def _check_model(self, request) -> Optional[ErrorResponse]: - if request.model == self.served_model: - return - return self.create_error_response( - message=f"The model `{request.model}` does not exist.", - err_type="NotFoundError", - status_code=HTTPStatus.NOT_FOUND) - - def _validate_prompt_and_tokenize( - self, - request: Union[ChatCompletionRequest, CompletionRequest], - prompt: Optional[str] = None, - prompt_ids: Optional[List[int]] = None) -> List[int]: - if not (prompt or prompt_ids): - raise ValueError("Either prompt or prompt_ids should be provided.") - if (prompt and prompt_ids): - raise ValueError( - "Only one of prompt or prompt_ids should be provided.") - - input_ids = prompt_ids if prompt_ids is not None else self.tokenizer( - prompt).input_ids - token_num = len(input_ids) - - if request.max_tokens is None: - request.max_tokens = self.max_model_len - token_num - - if token_num + request.max_tokens > self.max_model_len: - raise ValueError( - f"This model's maximum context length is {self.max_model_len} tokens. " - f"However, you requested {request.max_tokens + token_num} tokens " - f"({token_num} in the messages, " - f"{request.max_tokens} in the completion). 
" - f"Please reduce the length of the messages or completion.", ) - else: - return input_ids diff --git a/vllm/logger.py b/vllm/logger.py deleted file mode 100644 index 24d4f0ec1ae0a6701c296dacd2614aa29368f80a..0000000000000000000000000000000000000000 --- a/vllm/logger.py +++ /dev/null @@ -1,56 +0,0 @@ -# Adapted from -# https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py -"""Logging configuration for vLLM.""" -import logging -import sys - -_FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" -_DATE_FORMAT = "%m-%d %H:%M:%S" - - -class NewLineFormatter(logging.Formatter): - """Adds logging prefix to newlines to align multi-line messages.""" - - def __init__(self, fmt, datefmt=None): - logging.Formatter.__init__(self, fmt, datefmt) - - def format(self, record): - msg = logging.Formatter.format(self, record) - if record.message != "": - parts = msg.split(record.message) - msg = msg.replace("\n", "\r\n" + parts[0]) - return msg - - -_root_logger = logging.getLogger("vllm") -_default_handler = None - - -def _setup_logger(): - _root_logger.setLevel(logging.DEBUG) - global _default_handler - if _default_handler is None: - _default_handler = logging.StreamHandler(sys.stdout) - _default_handler.flush = sys.stdout.flush # type: ignore - _default_handler.setLevel(logging.INFO) - _root_logger.addHandler(_default_handler) - fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT) - _default_handler.setFormatter(fmt) - # Setting this will avoid the message - # being propagated to the parent logger. - _root_logger.propagate = False - - -# The logger is initialized when the module is imported. -# This is thread-safe as the module is only imported once, -# guaranteed by the Python GIL. -_setup_logger() - - -def init_logger(name: str): - # Use the same settings as above for root logger - logger = logging.getLogger(name) - logger.setLevel(logging.DEBUG) - logger.addHandler(_default_handler) - logger.propagate = False - return logger diff --git a/vllm/lora/__init__.py b/vllm/lora/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py deleted file mode 100644 index e1aac20b038b4890cef3ca28388d1ea230ff595c..0000000000000000000000000000000000000000 --- a/vllm/lora/layers.py +++ /dev/null @@ -1,975 +0,0 @@ -# pylint: disable=unused-argument -import math -from dataclasses import dataclass -from typing import TYPE_CHECKING, List, Optional, Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F -from transformers import PretrainedConfig - -from vllm.config import LoRAConfig -from vllm.lora.punica import add_lora, add_lora_slice, bgmv -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce, - tensor_model_parallel_gather, -) -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - RowParallelLinear, - QKVParallelLinear, - MergedColumnParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.parallel_utils.utils import split_tensor_along_last_dim - -if TYPE_CHECKING: - pass - - -def _apply_lora( - x: torch.Tensor, - 
lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, - indices: torch.Tensor, - output: torch.Tensor, -): - """Applies lora to each input. - - This method applies all loras to each input. It uses the - indices vector to determine which lora yields the - correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the - output. - - Input shapes: - x: (batch_size, hidden_dim) - lora_a_stacked: (num_loras, lora_rank, hidden_dim) - lora_b_stacked: (num_loras, output_dim, lora_rank) - indices: (batch_size) - output: (batch_size, output_dim) - """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) - return output.view_as(org_output) - - -def _apply_lora_packed_nslice( - x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - indices: torch.Tensor, - output: torch.Tensor, - output_slices: Tuple[int, ...], -): - """Applies lora to each input. - - This method applies all loras to each input. It uses the - indices vector to determine which lora yields the - correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the - output. - - This method is used for layers that are composed of multiple sublayers - (slices) packed together. - - Input shapes: - x: (batch_size, hidden_dim) - lora_a_stacked: 3 element tuple of (num_loras, lora_rank, hidden_dim) - lora_b_stacked: 3 element tuple of (num_loras, output_dim, lora_rank) - indices: (batch_size) - output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), where n is number of slices - """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - offset_left = 0 - for slice_idx in range(len(output_slices)): - add_lora_slice(output, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], indices, 0, 1.0, offset_left, - output_slices[slice_idx]) - offset_left += output_slices[slice_idx] - return output.view_as(org_output) - - -@dataclass -class LoRAMapping: - # Per every token in input_ids: - index_mapping: Tuple[int, ...] - # Per sampled token: - prompt_mapping: Tuple[int, ...] - - def __post_init__(self): - self.index_mapping = tuple(self.index_mapping) - self.prompt_mapping = tuple(self.prompt_mapping) - - -class BaseLayerWithLoRA(nn.Module): - - def create_lora_weights(self, max_loras: int, lora_config: LoRAConfig, - model_config: PretrainedConfig) -> None: - """Initializes lora matrices.""" - ... - - def reset_lora(self, index: int): - """Resets the lora weights at index back to 0.""" - ... - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - ): - """Overwrites lora tensors at index.""" - ... - - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - indices_len: List[int], - ): - """Sets the mapping indices.""" - ... 
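To make the `_apply_lora` semantics documented above concrete, the following is a plain PyTorch reference of the same computation, written as a slow per-row loop rather than the fused punica kernel the real path dispatches to: each batch row selects one adapter through `indices`, an index of -1 disables LoRA for that row, and the low-rank product is accumulated into the already-computed base output in place.

```python
import torch


def apply_lora_reference(
    x: torch.Tensor,               # (batch_size, hidden_dim)
    lora_a_stacked: torch.Tensor,  # (num_loras, lora_rank, hidden_dim)
    lora_b_stacked: torch.Tensor,  # (num_loras, output_dim, lora_rank)
    indices: torch.Tensor,         # (batch_size,); -1 means "no LoRA"
    output: torch.Tensor,          # (batch_size, output_dim), updated in place
) -> torch.Tensor:
    for row, lora_idx in enumerate(indices.tolist()):
        if lora_idx < 0:
            continue
        a = lora_a_stacked[lora_idx]  # (lora_rank, hidden_dim)
        b = lora_b_stacked[lora_idx]  # (output_dim, lora_rank)
        # Project down to the LoRA rank, then back up to the output dim.
        output[row] += b @ (a @ x[row])
    return output
```

The LoRA scaling factor does not appear here because `LoRALayerWeights.optimize()` (further down in this diff) folds `lora_alpha / rank` into `lora_b` ahead of time, and `_apply_lora_packed_nslice` is the same idea except that each slice of a packed layer (for example q, k, and v) writes into its own column range of `output`.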
- - -class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): - - def __init__(self, base_layer: VocabParallelEmbedding) -> None: - super().__init__() - self.base_layer = base_layer - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: - - lora_vocab_start_idx = self.base_layer.org_vocab_size - weights_idx = None - if self.base_layer.vocab_end_index > lora_vocab_start_idx: - # We can start adding lora weights - weights_idx = max( - lora_vocab_start_idx - self.base_layer.vocab_start_index, 0) - self.embeddings_slice = (self.base_layer.vocab_start_index - - self.base_layer.org_vocab_size + - weights_idx, - self.base_layer.vocab_end_index - - self.base_layer.org_vocab_size) - self.embeddings_weights = self.base_layer.weight.data[weights_idx:] - self.embeddings_weights.fill_(0) - else: - self.embeddings_slice = None - self.embeddings_weights = None - - self.embeddings_tensors = torch.zeros( - ( - max_loras, - lora_config.lora_extra_vocab_size, - self.base_layer.embedding_dim, - ), - dtype=self.base_layer.weight.dtype, - device=self.base_layer.weight.device, - ) - self.lora_a_stacked = torch.zeros( - ( - max_loras, - self.base_layer.org_vocab_size + - lora_config.lora_extra_vocab_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - self.base_layer.embedding_dim, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) - self.lora_a_stacked_2d = self.lora_a_stacked.view( - self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1], - self.lora_a_stacked.shape[2], - ) - self.indices: Optional[torch.Tensor] = None - self.indices_len: Optional[List[int]] = None - self.embeddings_indices = None - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - self.embeddings_tensors[index] = 0 - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - ): - self.reset_lora(index) - self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( - lora_a, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if embeddings_tensor is not None: - self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. 
- shape[1]].copy_(embeddings_tensor, non_blocking=True) - if self.embeddings_slice is not None: - # TODO(yard1): Optimize this copy, we don't need to copy - # everything, just the modified part - embeddings = self.embeddings_tensors.view( - self.embeddings_tensors.shape[0] * - self.embeddings_tensors.shape[1], - self.embeddings_tensors.shape[2] - )[self.embeddings_slice[0]:self.embeddings_slice[1]] - self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings) - - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - indices_len: List[int], - ): - self.indices = base_indices - self.embeddings_indices = embeddings_indices - self.indices_len = indices_len - - def forward(self, x: torch.Tensor) -> torch.Tensor: - added_tokens_mask = x > self.base_layer.org_vocab_size - 1 - indices = self.embeddings_indices[1][:self.indices_len[3]].view_as(x) - full_lora_a_embeddings = F.embedding( - x + indices, - self.lora_a_stacked_2d, - ) - indices = self.embeddings_indices[0][:self.indices_len[3]].view_as(x) - full_output = self.base_layer.forward( - x.add_(indices * added_tokens_mask)) - - full_output_org = full_output - if full_output.ndim == 3: - full_output = full_output.view( - full_output.shape[0] * full_output.shape[1], -1) - if full_lora_a_embeddings.ndim == 3: - full_lora_a_embeddings = full_lora_a_embeddings.view( - full_lora_a_embeddings.shape[0] * - full_lora_a_embeddings.shape[1], -1) - bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) - return full_output.view_as(full_output_org) - - -class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA): - - def __init__(self, base_layer: ColumnParallelLinear) -> None: - super().__init__() - self.base_layer = base_layer - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: - self.lora_a_stacked = torch.zeros( - max_loras, - 1, - lora_config.max_lora_rank, - self.base_layer.weight.shape[1], - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) - self.lora_b_stacked = torch.zeros( - max_loras, - 1, - self.base_layer.weight.shape[0], - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) - - self.indices: Optional[torch.Tensor] = None - self.indices_len: Optional[List[int]] = None - self.output_dim = self.lora_b_stacked.shape[1] - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - ): - self.reset_lora(index) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - indices_len: List[int], - ): - self.indices = base_indices - self.indices_len = indices_len - - def apply_weights(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.linear_method.apply_weights( - self.base_layer.linear_weights, x, bias) - _apply_lora( - x, - self.lora_a_stacked, - self.lora_b_stacked, - 
self.indices[:self.indices_len[0]], - output, - ) - return output - - def forward(self, input_): - """Forward of ColumnParallelLinear - - Args: - input_: Tensor whose last dimension is `input_size`. - - Returns: - - output - - bias - """ - bias = (self.base_layer.bias - if not self.base_layer.skip_bias_add else None) - - # Matrix multiply. - output_parallel = self.apply_weights(input_, bias) - if self.base_layer.gather_output: - # All-gather across the partitions. - output = tensor_model_parallel_all_gather(output_parallel) - else: - output = output_parallel - output_bias = (self.base_layer.bias - if self.base_layer.skip_bias_add else None) - return output, output_bias - - @property - def linear_weights(self): - return self.base_layer.linear_weights - - -class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): - """ColumnParallelLinear layer that is composed of 2 sublayers (slices) - packed together (eg. gate_proj + up_proj -> gate_up_proj). - - This means we have 2 LoRAs, each applied to one half of the layer. - - Both slices must have the same size. - """ - - def __init__(self, base_layer: MergedColumnParallelLinear) -> None: - super().__init__(base_layer) - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: - n_slices = 2 - if not (len(self.base_layer.output_sizes) == n_slices - and self.base_layer.output_sizes[0] - == self.base_layer.output_sizes[1]): - raise ValueError( - "LoRAColumnParallelLinear2Slice requires 2 slices with " - "the same size.") - self.tp_size = get_tensor_model_parallel_world_size() - - self.lora_a_stacked = tuple( - torch.zeros( - max_loras, - 1, - lora_config.max_lora_rank, - self.base_layer.weight.shape[1], - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) for _ in range(n_slices)) - self.lora_b_stacked = tuple( - torch.zeros( - max_loras, - 1, - self.base_layer.weight.shape[0] // 2, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) for _ in range(n_slices)) - - self.indices: Optional[torch.Tensor] = None - self.output_dim = self.lora_b_stacked[0].shape[2] - - def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_a_stacked[1][index] = 0 - self.lora_b_stacked[0][index] = 0 - self.lora_b_stacked[1][index] = 0 - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - ): - self.reset_lora(index) - - if self.tp_size > 1: - tensor_model_parallel_rank = get_tensor_model_parallel_rank() - shard_size = self.output_dim - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size - lora_b = lora_b[0][:, - start_idx:end_idx], lora_b[1][:, - start_idx:end_idx] - - if lora_a[0] is not None: - self.lora_a_stacked[0][ - index, 0, :lora_a[0].shape[1], :lora_a[0].shape[0]].copy_( - lora_a[0].T, non_blocking=True) - self.lora_b_stacked[0][ - index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( - lora_b[0].T, non_blocking=True) - if lora_a[1] is not None: - self.lora_a_stacked[1][ - index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( - lora_a[1].T, non_blocking=True) - self.lora_b_stacked[1][ - index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( - lora_b[1].T, non_blocking=True) - - def apply_weights(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.linear_method.apply_weights( - 
self.base_layer.linear_weights, x, bias) - _apply_lora_packed_nslice( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices[:self.indices_len[0]], - output, - (self.output_dim, self.output_dim), - ) - return output - - -class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): - """ColumnParallelLinear layer that is composed of 3 sublayers (slices) - packed together in qkv proj fashion - (q_proj + k_proj + v_proj -> qkv_proj). - - This means we have 3 LoRAs, each applied to one slice of the layer. - - Q slice may have different shape than K and V slices (which both have - the same shape). - """ - - def __init__(self, base_layer: QKVParallelLinear) -> None: - super().__init__(base_layer) - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: - self.tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - self.q_proj_shard_size = (self.base_layer.num_heads * - self.base_layer.head_size) - self.kv_proj_shard_size = (self.base_layer.num_kv_heads * - self.base_layer.head_size) - self.q_shard_id = tp_rank - self.kv_shard_id = tp_rank // self.base_layer.num_kv_head_replicas - - # q, k, v - self.lora_a_stacked = ( - torch.zeros( - max_loras, - 1, - lora_config.max_lora_rank, - self.base_layer.weight.shape[1], - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ), - torch.zeros( - max_loras, - 1, - lora_config.max_lora_rank, - self.base_layer.weight.shape[1], - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ), - torch.zeros( - max_loras, - 1, - lora_config.max_lora_rank, - self.base_layer.weight.shape[1], - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ), - ) - self.lora_b_stacked = ( - torch.zeros( - max_loras, - 1, - self.q_proj_shard_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ), - torch.zeros( - max_loras, - 1, - self.kv_proj_shard_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ), - torch.zeros( - max_loras, - 1, - self.kv_proj_shard_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ), - ) - - self.output_slices = (self.q_proj_shard_size, self.kv_proj_shard_size, - self.kv_proj_shard_size) - self.packed_indices: Optional[torch.Tensor] = None - self.standard_indices: Optional[torch.Tensor] = None - self.indices_len: Optional[List[int]] = None - - def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_b_stacked[0][index] = 0 - self.lora_a_stacked[1][index] = 0 - self.lora_b_stacked[1][index] = 0 - self.lora_a_stacked[2][index] = 0 - self.lora_b_stacked[2][index] = 0 - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - ): - self.reset_lora(index) - - if self.tp_size > 1: - if lora_b[0] is not None: - lora_b_q = lora_b[0][:, self.q_proj_shard_size * - self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] - self.lora_b_stacked[0][ - index, 0, :lora_b_q.shape[1], :lora_b_q.shape[0]].copy_( - lora_b_q.T, non_blocking=True) - if lora_b[1] is not None: - lora_b_k = lora_b[1][:, self.kv_proj_shard_size * - self.kv_shard_id:self.kv_proj_shard_size * - (self.kv_shard_id + 1)] - self.lora_b_stacked[1][ - index, 0, :lora_b_k.shape[1], :lora_b_k.shape[0]].copy_( - lora_b_k.T, non_blocking=True) - 
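# Under tensor parallelism each rank holds only its shard of the packed qkv
# projection, so only the matching column range of each lora_b matrix is
# copied in: the q slice is selected with q_shard_id (the tensor-parallel
# rank), while the k and v slices use kv_shard_id, which divides the rank by
# num_kv_head_replicas because KV heads can be replicated across ranks.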
if lora_b[2] is not None: - lora_b_v = lora_b[2][:, self.kv_proj_shard_size * - self.kv_shard_id:self.kv_proj_shard_size * - (self.kv_shard_id + 1)] - self.lora_b_stacked[2][ - index, 0, :lora_b_v.shape[1], :lora_b_v.shape[0]].copy_( - lora_b_v.T, non_blocking=True) - else: - if lora_b[0] is not None: - self.lora_b_stacked[0][ - index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( - lora_b[0].T, non_blocking=True) - if lora_b[1] is not None: - self.lora_b_stacked[1][ - index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( - lora_b[1].T, non_blocking=True) - if lora_b[2] is not None: - self.lora_b_stacked[2][ - index, 0, :lora_b[2].shape[1], :lora_b[2].shape[0]].copy_( - lora_b[2].T, non_blocking=True) - - if lora_a[0] is not None: - self.lora_a_stacked[0][ - index, 0, :lora_a[0].shape[1], :lora_a[0].shape[0]].copy_( - lora_a[0].T, non_blocking=True) - if lora_a[1] is not None: - self.lora_a_stacked[1][ - index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( - lora_a[1].T, non_blocking=True) - if lora_a[2] is not None: - self.lora_a_stacked[2][ - index, 0, :lora_a[2].shape[1], :lora_a[2].shape[0]].copy_( - lora_a[2].T, non_blocking=True) - - def apply_weights(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.linear_method.apply_weights( - self.base_layer.linear_weights, x, bias) - _apply_lora_packed_nslice( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices[:self.indices_len[0]], - output, - self.output_slices, - ) - return output - - -class RowParallelLinearWithLoRA(BaseLayerWithLoRA): - - def __init__(self, base_layer: RowParallelLinear) -> None: - super().__init__() - self.base_layer = base_layer - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> None: - self.lora_a_stacked = torch.zeros( - ( - max_loras, - 1, - lora_config.max_lora_rank, - self.base_layer.weight.shape[1], - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - self.base_layer.weight.shape[0], - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) - self.indices: Optional[torch.Tensor] = None - self.indices_len: Optional[List[int]] = None - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - ): - self.reset_lora(index) - if self.base_layer.tp_size > 1: - tensor_model_parallel_rank = get_tensor_model_parallel_rank() - shard_size = self.base_layer.weight.shape[1] - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size - lora_a = lora_a[start_idx:end_idx, :] - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - indices_len: List[int], - ): - self.indices = base_indices - self.indices_len = indices_len - - def apply_weights(self, x: torch.Tensor) -> torch.Tensor: - output = self.base_layer.linear_method.apply_weights( - self.base_layer.linear_weights, x) - 
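# The base projection is computed first with the unmodified weights; the
# LoRA delta for whichever adapter is active on each row (selected by the
# slice of base indices below) is then accumulated into `output` in place.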
_apply_lora( - x, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices[:self.indices_len[0]], - output, - ) - return output - - def forward(self, input_): - """Forward of RowParallelLinear - - Args: - input_: tensor whose last dimension is `input_size`. If - `input_is_parallel` is set, then the last dimension - is `input_size // tp_size`. - - Returns: - - output - - bias - """ - # Set up backprop all-reduce. - if self.base_layer.input_is_parallel: - input_parallel = input_ - else: - # TODO: simplify code below - tp_rank = get_tensor_model_parallel_rank() - splitted_input = split_tensor_along_last_dim( - input_, num_partitions=self.base_layer.tp_size) - input_parallel = splitted_input[tp_rank].contiguous() - - # Matrix multiply. - output_parallel = self.apply_weights(input_parallel) - if self.base_layer.reduce_results and self.base_layer.tp_size > 1: - output_ = tensor_model_parallel_all_reduce(output_parallel) - else: - output_ = output_parallel - - if not self.base_layer.skip_bias_add: - output = (output_ + self.base_layer.bias - if self.base_layer.bias is not None else output_) - output_bias = None - else: - output = output_ - output_bias = self.base_layer.bias - return output, output_bias - - @property - def weight(self): - return self.base_layer.weight - - -class SamplerWithLoRA(BaseLayerWithLoRA): - - def __init__( - self, - base_layer: Sampler, - hidden_size: int, - dtype: torch.dtype, - device: torch.device, - ) -> None: - super().__init__() - self.base_layer = base_layer - self.hidden_size = hidden_size - self.dtype = dtype - self.device = device - - @property - def vocab_size(self): - return self.base_layer.vocab_size - - @property - def org_vocab_size(self): - return self.base_layer.org_vocab_size - - @property - def include_gpu_probs_tensor(self): - return self.base_layer.include_gpu_probs_tensor - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - # Keep this in sync with csrc/punica/bgmv/bgmv_config.h - if 32000 < self.base_layer.vocab_size > 33024: - raise ValueError( - "When using LoRA, vocab size must be 32000 >= vocab_size <= 33024" - ) - self.lora_a_stacked = torch.zeros( - ( - max_loras, - 1, - lora_config.max_lora_rank, - self.hidden_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - # Pad for kernel compatibility - math.ceil(self.base_layer.vocab_size / - lora_config.lora_vocab_padding_size) * - lora_config.lora_vocab_padding_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.embeddings_tensors = torch.full( - (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size), - fill_value=float("-inf"), - dtype=self.dtype, - device=self.device, - ) - self.indices = None - self.indices_padded = None - self.indices_len = None - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - self.embeddings_tensors[index] = float("-inf") - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - ): - self.reset_lora(index) - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if embeddings_tensor is not None: - self.embeddings_tensors[ - index, 
:embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ] = embeddings_tensor - - def set_mapping( - self, - base_indices: torch.Tensor, - sampler_indices: torch.Tensor, - sampler_indices_padded: torch.Tensor, - embeddings_indices: torch.Tensor, - indices_len: List[int], - ): - self.indices = sampler_indices - self.indices_padded = sampler_indices_padded - self.indices_len = indices_len - - def _get_logits( - self, - hidden_states: torch.Tensor, - embedding: torch.Tensor, - embedding_bias: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - # Get the logits for the next tokens. - logits = torch.matmul(hidden_states, embedding.t()) - if embedding_bias is not None: - logits += embedding_bias - logits = tensor_model_parallel_gather(logits) - if logits is None: - return None - - lora_logits = torch.empty( - self.embeddings_tensors.shape[0] + 1, - self.embeddings_tensors.shape[1], - hidden_states.shape[0], - dtype=self.embeddings_tensors.dtype, - device=self.embeddings_tensors.device, - ) - torch.matmul(self.embeddings_tensors, - hidden_states.T, - out=lora_logits[:-1]) - lora_logits[-1] = float("-inf") - lora_logits = lora_logits.mT - lora_logits = (lora_logits.reshape( - lora_logits.shape[0] * lora_logits.shape[1], - lora_logits.shape[2], - ).index_select(0, - self.indices_padded[:self.indices_len[2]]).nan_to_num_( - nan=float("-inf"), - posinf=float("inf"), - neginf=float("-inf"))) - logits[:, - self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + - lora_logits.shape[1]] = lora_logits - - _apply_lora( - hidden_states, - self.lora_a_stacked, - self.lora_b_stacked, - self.indices[:self.indices_len[1]], - logits, - ) - - # Remove paddings in vocab (if any). - logits = logits[:, :self.base_layer.vocab_size] - - return logits - - def forward(self, *args, **kwargs): - return type(self.base_layer).forward(self, *args, **kwargs) - - -def from_layer( - layer: nn.Module, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None) -> BaseLayerWithLoRA: - supported_layer_types = { - VocabParallelEmbedding: VocabParallelEmbeddingWithLoRA, - ColumnParallelLinear: ColumnParallelLinearWithLoRA, - QKVParallelLinear: QKVParallelLinearWithLora, - MergedColumnParallelLinear: MergedColumnParallelLinearWithLoRA, - RowParallelLinear: RowParallelLinearWithLoRA, - } - for src_layer_type, lora_layer_type in supported_layer_types.items(): - if type(layer) is src_layer_type: # pylint: disable=unidiomatic-typecheck - ret = lora_layer_type(layer) - ret.create_lora_weights(max_loras, lora_config, model_config) - return ret - return layer - - -def from_layer_sampler( - layer: Sampler, - lm_head: ParallelLMHead, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, -) -> SamplerWithLoRA: - ret = SamplerWithLoRA(layer, lm_head.embedding_dim, lm_head.weight.dtype, - lm_head.weight.device) - ret.create_lora_weights(max_loras, lora_config, model_config) - return ret diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py deleted file mode 100644 index fbb228c9582d423f525fdcea7ca4409464f602e0..0000000000000000000000000000000000000000 --- a/vllm/lora/lora.py +++ /dev/null @@ -1,160 +0,0 @@ -from typing import List, Optional - -import torch -from vllm.utils import in_wsl - - -class LoRALayerWeights: - """LoRA weights for a layer composed of two low rank matrixes.""" - - def __init__( - self, - module_name: str, - rank: int, - lora_alpha: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor] = None, 
- scaling: Optional[float] = None, - ) -> None: - self.module_name = module_name - self.rank = rank - self.lora_alpha = lora_alpha - self.lora_a = lora_a - self.lora_b = lora_b - self.embeddings_tensor = embeddings_tensor - - if scaling is None: - self.scaling = self.lora_alpha / self.rank - else: - self.scaling = scaling - - def optimize(self) -> "LoRALayerWeights": - """Optimize the LoRA by merging the scaling into lora_b.""" - if self.scaling == 1: - return - self.lora_b *= self.scaling - self.scaling = 1 - return self - - @property - def input_dim(self) -> int: - return self.lora_a.shape[0] - - @property - def output_dim(self) -> int: - return self.lora_b.shape[1] - - @property - def is_packed(self) -> bool: - return False - - @property - def extra_vocab_size(self) -> int: - return self.embeddings_tensor.shape[ - 0] if self.embeddings_tensor is not None else 0 - - @classmethod - def create_dummy_lora_weights( - cls, - module_name: str, - input_dim: int, - output_dim: int, - rank: int, - dtype: torch.dtype, - device: torch.device, - embeddings_tensor_dim: Optional[int] = None) -> "LoRALayerWeights": - pin_memory = str(device) == "cpu" and not in_wsl() - lora_a = torch.zeros([input_dim, rank], - dtype=dtype, - device=device, - pin_memory=pin_memory) - lora_b = torch.zeros([rank, output_dim], - dtype=dtype, - device=device, - pin_memory=pin_memory) - embeddings_tensor = torch.rand( - 10, - embeddings_tensor_dim, - dtype=dtype, - device=device, - pin_memory=pin_memory) if embeddings_tensor_dim else None - return cls( - module_name, - rank=rank, - lora_alpha=1, - lora_a=lora_a, - lora_b=lora_b, - embeddings_tensor=embeddings_tensor, - ) - - -class PackedLoRALayerWeights(LoRALayerWeights): - """LoRA used for packed layers (eg. qkv_proj).""" - - def __init__( - self, - module_name: str, - rank: int, - lora_alphas: List[int], - lora_a: List[torch.Tensor], - lora_b: List[torch.Tensor], - scaling: Optional[List[float]] = None, - ) -> None: - super().__init__( - module_name=module_name, - rank=rank, - lora_alpha=0, - lora_a=lora_a, - lora_b=lora_b, - scaling=scaling, - embeddings_tensor=None, - ) - self.lora_alphas = lora_alphas - if scaling is None: - self.scaling = [ - lora_alpha / self.rank for lora_alpha in self.lora_alphas - ] - - @classmethod - def pack(cls, loras: List["LoRALayerWeights"]) -> "PackedLoRALayerWeights": - """Pack a list of LoRAs into a single LoRA. - - If LoRA is None, it signifies that the submodule does not have a LoRA. 
- """ - first_lora = next(lora for lora in loras if lora is not None) - for lora in loras: - if lora is None: - continue - lora.optimize() - rank = first_lora.rank - module_name = first_lora.module_name - obj = cls( - module_name, - rank, - [lora.lora_alpha if lora is not None else None for lora in loras], - [lora.lora_a if lora is not None else None for lora in loras], - [lora.lora_b if lora is not None else None for lora in loras], - scaling=[1 if lora is not None else None for lora in loras]) - return obj - - def optimize(self) -> "PackedLoRALayerWeights": - """Optimize the LoRA by merging the scaling into lora_b.""" - for i in range(len(self.lora_b)): - if self.scaling[i] == 1 or self.lora_b[i] is None: - continue - self.lora_b[i] *= self.scaling[i] - self.scaling[i] = 1 - return self - - @property - def input_dim(self) -> int: - raise NotImplementedError() - - @property - def output_dim(self) -> int: - raise NotImplementedError() - - @property - def is_packed(self) -> bool: - return True diff --git a/vllm/lora/models.py b/vllm/lora/models.py deleted file mode 100644 index 6c78c4a2c77715fe2c72bda01b37250503937110..0000000000000000000000000000000000000000 --- a/vllm/lora/models.py +++ /dev/null @@ -1,654 +0,0 @@ -import copy -import json -import logging -import math -import os -import re -from typing import (Any, Callable, Dict, Hashable, List, Optional, Tuple, Type, - Union) - -import safetensors.torch -import torch -from torch import nn - -from vllm.config import LoRAConfig -from vllm.utils import LRUCache, in_wsl - -from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping, from_layer, from_layer_sampler -from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule - -logger = logging.getLogger(__name__) - -# TODO: The mappings below should be moved to individual model classes. - -PACKED_MODULES_CFG = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], -} - -TARGET_MODULES_QKV = [ - "qkv_proj", - "o_proj", - "gate_up_proj", - "down_proj", - "embed_tokens", - "lm_head", -] - -EMBEDDING_MODULES = { - "embed_tokens": "input_embeddings", - "lm_head": "output_embeddings", -} - -EMBEDDING_PADDING_MODULES = ["lm_head"] - -_GLOBAL_LORA_ID = 0 - - -def convert_mapping( - mapping: LoRAMapping, lora_index_to_id: List[Optional[int]], - max_loras: int, vocab_size: int, extra_vocab_size: int -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, List[int]]: - """Converts LoRAMapping to index tensors. - - Args: - mapping: LoRAMapping mapping rows in a batch to LoRA ids. - lora_index_to_id: List mapping LoRA ids to LoRA indices. - max_loras: Maximum number of LoRAs. - vocab_size: Model vocab size. - extra_vocab_size: Extra vocab size each LoRA can have. - - Returns: - A tuple of tensors: - base_indices: Tensor of shape [batch_size] mapping batch rows to - LoRA indices. - sampler_indices: Tensor of shape [batch_size] mapping requests to - LoRA indices for sampler. For generation, this will be the - same as base_indicies. For prefill, this will map requests - to LoRA indices. - sampler_indices_padded: Tensor of shape [batch_size] mapping - requests to LoRA indices for sampler with padding. - Same as sampler_indicies, but -1 is replaced with - max_loras. - embeddings_indices: Tensor of shape [2, batch_size] mapping - requests to embedding indices. First row is for embeddings - added by the LoRAs, second row is for the LoRA.lora_a - embeddings. 
- indices_len: List of lengths of the above tensors. - """ - indices = list(mapping.index_mapping).copy() - embedding_indices = indices.copy() - lora_indices = indices.copy() - prompt_mapping = [ - lora_index_to_id.index(x) if x > 0 else -1 - for x in mapping.prompt_mapping - ] - lora_idx = None - for i in range(len(indices)): - # TODO index can be slow. optimize - lora_idx = (lora_index_to_id.index(indices[i]) - if indices[i] > 0 else -1) - embedding_indices[i] = lora_idx if indices[i] > 0 else 0 - indices[i] = i - lora_indices[i] = lora_idx - - indices = torch.tensor([indices, lora_indices, embedding_indices], - dtype=torch.long, - device="cuda") - prompt_mapping = torch.tensor(prompt_mapping, - device="cuda", - dtype=torch.long) - embeddings_indices = torch.stack([ - indices[2] * extra_vocab_size, - indices[2] * (vocab_size + extra_vocab_size) - ]) - embeddings_indices[embeddings_indices == -1] = max_loras - 1 - base_indices = indices[1] - sampler_indices = prompt_mapping - sampler_indices_padded = sampler_indices.clone() - sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 - sampler_indices_padded = ( - torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + - (sampler_indices_padded * len(sampler_indices_padded))) - indices_len = (base_indices.shape[-1], sampler_indices.shape[-1], - sampler_indices_padded.shape[-1], - embeddings_indices.shape[-1]) - - return (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, indices_len) - - -def get_lora_id(): - global _GLOBAL_LORA_ID - _GLOBAL_LORA_ID += 1 - return _GLOBAL_LORA_ID - - -class LoRAModel: - """A LoRA fine-tuned model.""" - - def __init__( - self, - lora_model_id: int, - rank: int, - loras: Dict[str, LoRALayerWeights], - ) -> None: - self.id = lora_model_id - assert (lora_model_id > - 0), f"a valid lora id should be greater than 0, got {self.id}" - self.rank = rank - self.loras: Dict[str, LoRALayerWeights] = loras - - @property - def extra_vocab_size(self) -> int: - return max(lora.extra_vocab_size - for lora in self.loras.values()) if self.loras else 0 - - def get_lora(self, module_name: str) -> Optional[LoRALayerWeights]: - """Get LoRA for a given module by name""" - return self.loras.get(module_name, None) - - # (yard1): TODO see if we can derive target_embedding_padding automatically - @classmethod - def from_lora_tensors( - cls, - lora_model_id: int, - rank: int, - lora_alpha: int, - tensors: Dict[str, torch.Tensor], - device: str = "cuda", - dtype: Optional[torch.dtype] = None, - embeddings: Optional[Dict[str, torch.Tensor]] = None, - target_embedding_padding: Optional[int] = None, - ) -> "LoRAModel": - """Create a LoRAModel from a dictionary of tensors.""" - pin_memory = str(device) == "cpu" and not in_wsl() - loras: Dict[str, LoRALayerWeights] = {} - for tensor_name, tensor in tensors.items(): - module_name, is_lora_a = parse_fine_tuned_lora_name(tensor_name) - if module_name not in loras: - lora_embeddings_tensor = None - if embeddings: - embeddings_module = next( - (k for k in EMBEDDING_MODULES if k in module_name), - None) - if embeddings_module: - lora_embeddings_tensor = embeddings[ - EMBEDDING_MODULES[embeddings_module]].to( - device=device, dtype=dtype) - if pin_memory: - lora_embeddings_tensor = ( - lora_embeddings_tensor.pin_memory()) - loras[module_name] = LoRALayerWeights(module_name, rank, - lora_alpha, None, None, - lora_embeddings_tensor) - if is_lora_a: - loras[module_name].lora_a = tensor.to(device=device, - dtype=dtype).t() - if pin_memory: 
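# Pinned (page-locked) host memory speeds up the later host-to-device
# copies of these CPU-resident LoRA weights.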
- loras[module_name].lora_a = loras[ - module_name].lora_a.pin_memory() - else: - loras[module_name].lora_b = tensor.to(device=device, - dtype=dtype).t() - if any(name in module_name - for name in EMBEDDING_PADDING_MODULES - ) and target_embedding_padding is not None: - lora_b = loras[module_name].lora_b - assert target_embedding_padding >= lora_b.shape[1] - addition = target_embedding_padding - lora_b.shape[1] - loras[module_name].lora_b = torch.nn.functional.pad( - lora_b, (0, addition)) - if pin_memory: - loras[module_name].lora_b = loras[ - module_name].lora_b.pin_memory() - - for lora in loras.values(): - lora.optimize() - return cls(lora_model_id, rank, loras) - - @classmethod - def from_local_checkpoint( - cls, - lora_dir: str, - lora_model_id: Optional[int] = None, - device: str = "cuda", - dtype: Optional[torch.dtype] = None, - target_embedding_padding: Optional[int] = None) -> "LoRAModel": - """Create a LoRAModel from a local checkpoint.""" - lora_config_path = os.path.join(lora_dir, "adapter_config.json") - lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") - lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") - new_embeddings_tensor_path = os.path.join( - lora_dir, "new_embeddings.safetensors") - new_embeddings_bin_file_path = os.path.join(lora_dir, - "new_embeddings.bin") - if os.path.isfile(lora_tensor_path): - tensors = safetensors.torch.load_file(lora_tensor_path) - elif os.path.isfile(lora_bin_file_path): - tensors = torch.load(lora_bin_file_path) - else: - raise ValueError(f"{lora_dir} doesn't contain tensors") - - embeddings = None - if os.path.isfile(new_embeddings_tensor_path): - embeddings = safetensors.torch.load_file( - new_embeddings_tensor_path) - elif os.path.isfile(new_embeddings_bin_file_path): - embeddings = torch.load(new_embeddings_bin_file_path) - - with open(lora_config_path) as f: - config = json.load(f) - rank = config["r"] - lora_alpha = config["lora_alpha"] - return cls.from_lora_tensors( - lora_model_id=get_lora_id() - if lora_model_id is None else lora_model_id, - rank=rank, - lora_alpha=lora_alpha, - tensors=tensors, - device=device, - dtype=dtype, - embeddings=embeddings, - target_embedding_padding=target_embedding_padding, - ) - - -class LoRAModelManager: - """A manager that manages multiple LoRA-fine-tuned models.""" - - def __init__( - self, - model: nn.Module, - max_num_seqs: int, - max_num_batched_tokens: int, - vocab_size: int, - lora_config: LoRAConfig, - lora_target_modules: Union[str, List[str]] = TARGET_MODULES_QKV, - packed_modules_mapping: Dict[str, List[str]] = PACKED_MODULES_CFG, - ): - """Create a LoRAModelManager and adapter for a given model. - - Args: - model: the model to be adapted. - max_num_seqs: the maximum number of sequences model can run in a - single batch. - max_num_batched_tokens: the maximum number of tokens model can run - in a single batch. - vocab_size: the vocab size of the model. - lora_config: the LoRA configuration. - lora_target_modules: the target modules patterns to be adapted. - Support both single module name and a list of module names. - packed_modules_mapping: the mapping for packed modules. vLLM - packs some modules into one module, e.g., qkv_proj - is packed of q_proj, k_proj, and v_proj. These modules - have a single layer in the original model, but they are split - into multiple layers in the adapted model. 
- """ - self.lora_config = lora_config - self.max_num_seqs = max_num_seqs - assert self.capacity >= self.lora_slots - self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 - self.lora_index_to_id: List[Optional[int]] = [None] * self.lora_slots - self.vocab_size = vocab_size - self.base_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - self.sampler_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - self.sampler_indices_padded = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - self.embeddings_indices = torch.empty(2, - self.max_num_batched_tokens, - dtype=torch.long, - device="cuda") - self.offsets = [] - # 4 is the number of indicies tensors defined above - # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices - self.indices_len = [None] * 4 - - self.model: nn.Module = model - self.lora_target_modules: List[str] = ([ - lora_target_modules - ] if isinstance(lora_target_modules, str) else lora_target_modules) - self.lora_target_modules = copy.deepcopy(lora_target_modules) - self.packed_modules_mapping = copy.deepcopy(packed_modules_mapping) - self.packed_modules: Dict[str, List[str]] = {} - self.modules: Dict[str, "BaseLayerWithLoRA"] = {} - self._registered_loras: Dict[int, LoRAModel] = {} - # Dict instead of a Set for compatibility with LRUCache. - self._active_loras: Dict[int, None] = {} - self._last_mapping = None - self._create_lora_modules() - self.model.lora_manager = self - - @property - def capacity(self) -> int: - return self.lora_config.max_cpu_loras - - @property - def lora_slots(self) -> int: - return self.lora_config.max_loras - - def __len__(self) -> int: - return len(self._registered_loras) - - def activate_lora( - self, - lora_id: int, - ) -> bool: - """Move LoRA into a GPU buffer to be used in the forward pass.""" - if lora_id in self._active_loras: - return False - first_free_slot = next( - ((i, lora_id) for i, lora_id in enumerate(self.lora_index_to_id) - if lora_id is None), None) - if first_free_slot is None: - raise ValueError("No free lora slots") - index, _ = first_free_slot - self._active_loras[lora_id] = None - lora_model = self._registered_loras[lora_id] - logger.debug( - f"Activating LoRA. 
int id: {lora_model.id}, slot index: {index}") - self.lora_index_to_id[index] = lora_model.id - for module_name, module in self.modules.items(): - module_lora = lora_model.get_lora(module_name) - if module_lora: - module_lora.optimize() - module.set_lora(index, module_lora.lora_a, module_lora.lora_b, - module_lora.embeddings_tensor) - else: - module.reset_lora(index) - return True - - def _deactivate_lora(self, lora_id: int): - try: - index = self.lora_index_to_id.index(lora_id) - self.lora_index_to_id[index] = None - except ValueError: - pass - - def deactivate_lora(self, lora_id: int) -> bool: - """Remove a LoRA from a GPU buffer.""" - if lora_id in self._active_loras: - self._deactivate_lora(lora_id) - self._active_loras.pop(lora_id) - return True - return False - - def _add_lora(self, lora: LoRAModel) -> bool: - self._create_merged_loras_inplace(lora) - self._registered_loras[lora.id] = lora - - def add_lora(self, lora: LoRAModel) -> bool: - """Add a LoRAModel to the manager CPU cache.""" - if lora.id not in self._registered_loras: - if len(self._registered_loras) >= self.capacity: - raise RuntimeError("No free LoRA slots.") - self._add_lora(lora) - return True - return False - - def remove_lora(self, lora_id: int) -> bool: - """Remove a LoRAModel from the manager CPU cache.""" - # TODO: should we check active lora? - self.deactivate_lora(lora_id) - return bool(self._registered_loras.pop(lora_id, None)) - - # TODO see if this can be vectorized - def _set_lora_mapping(self, mapping: LoRAMapping) -> None: - (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, - indices_len) = convert_mapping(mapping, self.lora_index_to_id, - self.lora_slots + 1, self.vocab_size, - self.lora_config.lora_extra_vocab_size) - self.base_indices[:base_indices.shape[0]].copy_(base_indices) - self.sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) - self.sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( - sampler_indices_padded) - self.embeddings_indices[:embeddings_indices. 
- shape[0], :embeddings_indices.shape[1]].copy_( - embeddings_indices) - # Maintain the reference - self.indices_len[:] = indices_len - - def set_lora_mapping(self, lora_mapping: LoRAMapping) -> None: - if self._last_mapping != lora_mapping: - self._set_lora_mapping(lora_mapping) - self._last_mapping = lora_mapping - - def list_loras(self) -> Dict[int, LoRAModel]: - """List all registered LoRAModels.""" - return dict(self._registered_loras) - - def get_lora(self, lora_id: int) -> Optional[LoRAModel]: - return self._registered_loras.get(lora_id, None) - - def remove_all_loras(self) -> bool: - """Remove all LoRAModels from the manager.""" - self._registered_loras.clear() - self.lora_index_to_id = [None] * self.lora_slots - self._active_loras.clear() - - def _create_lora_modules(self): - for module_name, module in self.model.named_modules(): - if not self._match_target_modules(module_name): - continue - - new_module = replace_submodule( - self.model, module_name, - from_layer(module, self.lora_slots, self.lora_config, - self.model.config)) - # (yard1): TODO make this more robust - if "lm_head" in module_name: - sampler_module = self.model.get_submodule("sampler") - new_module = replace_submodule( - self.model, "sampler", - from_layer_sampler(sampler_module, module, self.lora_slots, - self.lora_config, self.model.config)) - self.register_module(module_name, new_module) - self._register_packed_modules(module_name) - new_module.set_mapping(self.base_indices, self.sampler_indices, - self.sampler_indices_padded, - self.embeddings_indices, self.indices_len) - - def register_module(self, module_name: str, module: "BaseLayerWithLoRA"): - assert isinstance(module, BaseLayerWithLoRA) - self.modules[module_name] = module - - def create_dummy_lora(self, lora_id: int, rank: int) -> LoRAModel: - """Create zero-initialized LoRAModel for warmup.""" - model = LoRAModel(lora_id, rank, {}) - for module_name, module in self.model.named_modules(): - if not self._match_target_modules(module_name) or not isinstance( - module, BaseLayerWithLoRA): - continue - parts = module_name.split(".") - if module_name not in self.packed_modules: - if parts[-1] in EMBEDDING_MODULES: - input_dim = (module.base_layer.org_vocab_size + - self.lora_config.lora_extra_vocab_size if - hasattr(module.base_layer, "org_vocab_size") - else module.base_layer.weight.shape[1]) - output_dim = module.base_layer.embedding_dim if hasattr( - module.base_layer, - "embedding_dim") else module.base_layer.weight.shape[0] - embeddings_tensor_dim = (module.base_layer.embedding_dim if - hasattr(module.base_layer, - "embedding_dim") else - module.base_layer.weight.shape[1]) - lora = LoRALayerWeights.create_dummy_lora_weights( - module_name, - input_dim, - output_dim, - rank, - module.lora_a_stacked.dtype, - "cpu", - embeddings_tensor_dim=embeddings_tensor_dim) - else: - lora = LoRALayerWeights.create_dummy_lora_weights( - module_name, - module.lora_a_stacked.shape[-1], - module.lora_b_stacked.shape[-2], - rank, - module.lora_a_stacked.dtype, - "cpu", - ) - lora.optimize() - else: - parts = module_name.split(".") - replacements = self.packed_modules_mapping[parts[-1]] - subloras = [] - for i, r in enumerate(replacements): - lora = LoRALayerWeights.create_dummy_lora_weights( - module_name + "." 
+ r, - module.lora_a_stacked[i].shape[-1], - module.lora_b_stacked[i].shape[-2], - rank, - module.lora_a_stacked[i].dtype, - "cpu", - ) - lora.optimize() - subloras.append(lora) - lora = PackedLoRALayerWeights.pack(subloras) - model.loras[module_name] = lora - return model - - def _match_target_modules(self, module_name: str): - return any( - re.match( - r".*\.{target_module}$".format(target_module=target_module), - module_name) or target_module == module_name - for target_module in self.lora_target_modules) - - def _register_packed_modules(self, module_full_name: str) -> None: - parts = module_full_name.split(".") - module_name = parts[-1] - replacements = self.packed_modules_mapping.get(module_name) - if not replacements: - return - prefix = ".".join(parts[:-1]) - self.packed_modules[module_full_name] = [ - prefix + "." + r if prefix else r for r in replacements - ] - - def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: - for module_name, new_module_names in self.packed_modules.items(): - replacement_loras = [] - has_replacement = False - for r in new_module_names: - lora = lora_model.get_lora(r) - replacement_loras.append(lora) - if lora: - has_replacement = True - if not has_replacement: - continue - for i in range(len(replacement_loras)): - if replacement_loras[i]: - continue - replacement_loras[i] = None - lora_model.loras[module_name] = PackedLoRALayerWeights.pack( - replacement_loras) - - -class LoRALRUCache(LRUCache): - - def __init__(self, capacity: int, deactivate_lora_fn: Callable[[Hashable], - None]): - super().__init__(capacity) - self.deactivate_lora_fn = deactivate_lora_fn - - def _on_remove(self, key: Hashable, value: Any): - logger.debug(f"Removing LoRA. int id: {key}") - self.deactivate_lora_fn(key) - return super()._on_remove(key, value) - - -class LRUCacheLoRAModelManager(LoRAModelManager): - """A model manager that manages multiple LoRAs with LRU cache.""" - - def __init__( - self, - model: nn.Module, - max_num_seqs: int, - max_num_batched_tokens: int, - vocab_size: int, - lora_config: LoRAConfig, - lora_target_modules: Union[str, List[str]] = TARGET_MODULES_QKV, - packed_modules_mapping: Dict[str, List[str]] = PACKED_MODULES_CFG, - ): - super().__init__(model, max_num_seqs, max_num_batched_tokens, - vocab_size, lora_config, lora_target_modules, - packed_modules_mapping) - self._registered_loras: LoRALRUCache = LoRALRUCache( - self.capacity, self.deactivate_lora) - self._active_loras: LoRALRUCache = LoRALRUCache( - self.lora_slots, self._deactivate_lora) - - def list_loras(self) -> Dict[int, LoRAModel]: - """List all registered LoRAModels.""" - return dict(self._registered_loras.cache) - - def add_lora(self, lora: LoRAModel) -> bool: - """Add a LoRAModel to the manager.""" - if lora.id not in self._registered_loras: - self._add_lora(lora) - was_added = True - else: - # We always touch to update the LRU cache order - self._registered_loras.touch(lora.id) - was_added = False - return was_added - - def activate_lora( - self, - lora_id: int, - ) -> bool: - if lora_id not in self._active_loras and len( - self._active_loras) >= self.lora_slots: - self._active_loras.remove_oldest() - result = super().activate_lora(lora_id) - # We always touch to update the LRU cache order - self._active_loras.touch(lora_id) - return result - - def remove_oldest_lora(self) -> bool: - if len(self._registered_loras) > 0: - self._registered_loras.remove_oldest() - return True - return False - - -def create_lora_manager( - model: nn.Module, - max_num_seqs: int, - 
max_num_batched_tokens: int, - vocab_size: int, - lora_config: LoRAConfig, - target_modules: Union[str, List[str]] = TARGET_MODULES_QKV, - lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager, - **kwargs) -> LoRAModelManager: - """Create a LoRA adapter for a given model.""" - if not getattr(model, "supports_lora", False): - raise ValueError(f"Model {type(model)} is not supported for LoRA.") - lora_manager = lora_manager_cls( - model=model, - max_num_seqs=max_num_seqs, - max_num_batched_tokens=max_num_batched_tokens, - vocab_size=vocab_size, - lora_config=lora_config, - lora_target_modules=target_modules, - **kwargs) - return lora_manager diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py deleted file mode 100644 index bcb73ccc19b0e04f854c90727fc6e94198372df0..0000000000000000000000000000000000000000 --- a/vllm/lora/punica.py +++ /dev/null @@ -1,176 +0,0 @@ -# Based on code from https://github.com/punica-ai/punica - -from typing import Optional - -import torch - -import_exc = None - -try: - import vllm._punica_C as punica_kernels -except ImportError as e: - import_exc = e - -if import_exc is None: - - def bgmv( - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - ): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight - matrices. - indicies: Shape: `[B]`. Indices of the weight matrices. - layer_idx: Layer index of the weight matrices. - scale: Scaling factor. - """ - punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale) - - def add_lora(y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - *, - buffer: Optional[torch.Tensor] = None): - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - buffer: Optional. Shape: `[B, R]`. Temporary buffer. - """ - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical innacuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, - 1.0) - punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, - scale) - - def add_lora_slice(y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - *, - buffer: Optional[torch.Tensor] = None): - """ - Same as `add_lora` but you can operate on slices of y. - Pass whole y, define y_offset and y_slice_size. 
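The `Semantics:` block in `add_lora` above fully specifies what the Punica kernel pair computes, so a slow PyTorch restatement can serve as a mental model (and as a correctness check) without the CUDA extension. The sketch below mirrors only the documented semantics; the tensor names and sizes are illustrative and this is not the code path vLLM actually runs.

```python
import torch

def add_lora_ref(y, x, wa_t_all, wb_t_all, indices, layer_idx, scale):
    """Naive per-row restatement of the documented add_lora semantics."""
    for i in range(x.size(0)):
        a = wa_t_all[indices[i], layer_idx]            # [R, H1]
        b = wb_t_all[indices[i], layer_idx]            # [H2, R]
        y[i] += (x[i].unsqueeze(0) @ a.transpose(-1, -2)
                 @ b.transpose(-1, -2) * scale).squeeze(0)

# Illustrative shapes: 4 rows, 3 stacked LoRAs, 1 layer, rank 8.
B, H1, H2, R, L = 4, 16, 16, 8, 1
y = torch.zeros(B, H2)
x = torch.randn(B, H1)
wa_t_all = torch.randn(3, L, R, H1)
wb_t_all = torch.randn(3, L, H2, R)
indices = torch.tensor([0, 1, 2, 0])
add_lora_ref(y, x, wa_t_all, wb_t_all, indices, layer_idx=0, scale=1.0)
```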
- - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - - Args: - y: Shape: `[B, H2]`. Output vectors. Will be changed in-place. - x: Shape: `[B, H1]`. Input vectors. - wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed - LoRA A matrices. - wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed - LoRA B matrices. - indicies: Shape: `[B]`. Indices of the LoRA weights. - layer_idx: Layer index of LoRA weights. - scale: Scaling factor. - y_offset: Offset to apply to the starting column of y. - y_slice_size: Size of the y column slice. - """ - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default to avoid - # numerical inaccuracies that would otherwise happen - # due to downcasting. - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - punica_kernels.dispatch_bgmv_low_level( - buffer, - x, - wa_t_all, - indicies, - layer_idx, - 1.0, - x.size(1), - buffer.size(1), - 0, - ) - punica_kernels.dispatch_bgmv_low_level( - y, - buffer, - wb_t_all, - indicies, - layer_idx, - scale, - buffer.size(1), - y_slice_size, - y_offset, - ) - -else: - - def _raise_exc( - *args, # pylint: disable=unused-argument - **kwargs # pylint: disable=unused-argument - ): - if torch.cuda.get_device_capability() < (8, 0): - raise ImportError("punica LoRA kernels require compute " - "capability>=8.0") from import_exc - else: - raise ImportError( - "punica LoRA kernels could not be imported. If you built vLLM " - "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var " - "was set.") from import_exc - - bgmv = _raise_exc - add_lora = _raise_exc - add_lora_slice = _raise_exc - -__all__ = [ - "bgmv", - "add_lora", - "add_lora_slice", -] diff --git a/vllm/lora/request.py b/vllm/lora/request.py deleted file mode 100644 index bbbf4880ab81bfde4801fb0f5e3ac3f0505bfbc6..0000000000000000000000000000000000000000 --- a/vllm/lora/request.py +++ /dev/null @@ -1,32 +0,0 @@ -from dataclasses import dataclass - - -@dataclass -class LoRARequest: - """ - Request for a LoRA adapter. - - Note that this class should be be used internally. For online - serving, it is recommended to not allow users to use this class but - instead provide another layer of abstraction to prevent users from - accessing unauthorized LoRA adapters. - - lora_int_id must be globally unique for a given adapter. - This is currently not enforced in vLLM. 
- """ - - lora_name: str - lora_int_id: int - lora_local_path: str - - def __post_init__(self): - if self.lora_int_id < 1: - raise ValueError( - f"lora_int_id must be > 0, got {self.lora_int_id}") - - def __eq__(self, value: object) -> bool: - return isinstance( - value, LoRARequest) and self.lora_int_id == value.lora_int_id - - def __hash__(self) -> int: - return self.lora_int_id diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py deleted file mode 100644 index f67a3812fb046255ed833cc99aba38fb91697d9b..0000000000000000000000000000000000000000 --- a/vllm/lora/utils.py +++ /dev/null @@ -1,39 +0,0 @@ -import logging -from typing import Tuple - -from torch import nn - -logger = logging.getLogger(__name__) - - -def replace_submodule(model: nn.Module, module_name: str, - new_module: nn.Module) -> nn.Module: - """Replace a submodule in a model with a new module.""" - parent = model.get_submodule(".".join(module_name.split(".")[:-1])) - target_name = module_name.split(".")[-1] - setattr(parent, target_name, new_module) - return new_module - - -def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: - """Parse the name of lora weights. - - args: - name: the name of the fine-tuned LoRA, e.g. - base_model.model.dense1.weight - return: - Tuple(module_name, is_lora_a): - module_name: the name of the module, e.g. model.dense1, - is_lora_a whether the tensor is lora_a or lora_b. - """ - parts = name.split(".") - assert parts[0] == "base_model" - assert parts[1] == "model" - if parts[-1] == "weight": - assert parts[-2] == "lora_A" or parts[-2] == "lora_B" - return ".".join(parts[2:-2]), parts[-2] == "lora_A" - - if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": - return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A" - - raise ValueError(f"{name} is unsupported format") diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py deleted file mode 100644 index a507c08588dad73bccb2e8772b53ed55a044b617..0000000000000000000000000000000000000000 --- a/vllm/lora/worker_manager.py +++ /dev/null @@ -1,237 +0,0 @@ -import logging -from abc import ABC, abstractmethod, abstractproperty -from typing import Any, List, Optional, Set, Type, Union - -import torch - -from vllm.lora.models import (TARGET_MODULES_QKV, LoRAModel, LoRAModelManager, - LRUCacheLoRAModelManager, create_lora_manager) -from vllm.lora.request import LoRARequest -from vllm.lora.layers import LoRAMapping -from vllm.config import LoRAConfig - -logger = logging.getLogger(__name__) - - -class WorkerLoRAManager(ABC): - """Abstract class for managing LoRA models on the worker side.""" - - def __init__(self, max_num_seqs: int, max_num_batched_tokens: int, - vocab_size: int, lora_config: LoRAConfig, - device: torch.device): - self.max_num_seqs = max_num_seqs - self.max_num_batched_tokens = max_num_batched_tokens - self.vocab_size = vocab_size - self.device = device - self.lora_config = lora_config - - @abstractproperty - def is_enabled(self) -> bool: - ... - - @abstractmethod - def create_lora_manager( - self, - model: torch.nn.Module, - target_modules: Union[str, List[str]] = TARGET_MODULES_QKV, - ) -> Any: - ... - - @abstractmethod - def set_active_loras(self, lora_requests: List[LoRARequest], - lora_mapping: LoRAMapping) -> None: - ... - - @abstractmethod - def add_lora(self, lora_request: LoRARequest) -> bool: - ... - - @abstractmethod - def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: - ... - - @abstractmethod - def remove_lora(self, lora_id: int) -> bool: - ... 
- - @abstractmethod - def remove_all_loras(self) -> bool: - ... - - @abstractmethod - def list_loras(self) -> Set[int]: - ... - - -class WorkerLoRAManager(WorkerLoRAManager): - """WorkerLoRAManager that manages LoRA models on the worker side. - - Every request, the requested LoRAs will be loaded (unless they are already - loaded), and every other LoRA will be unloaded.""" - - _lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager - - def __init__( - self, - max_num_seqs: int, - max_num_batched_tokens: int, - vocab_size: int, - lora_config: LoRAConfig, - device: torch.device, - lora_model_cls: Type[LoRAModel] = LoRAModel, - ): - self._lora_manager: Optional[LoRAModelManager] = None - self._lora_model_cls = lora_model_cls - super().__init__(max_num_seqs, max_num_batched_tokens, vocab_size, - lora_config, device) - - @property - def is_enabled(self) -> bool: - return True - - def create_lora_manager( - self, - model: torch.nn.Module, - target_modules: Union[str, List[str]] = TARGET_MODULES_QKV, - ) -> Any: - lora_manager = create_lora_manager( - model, - max_num_seqs=self.max_num_seqs, - max_num_batched_tokens=self.max_num_batched_tokens, - target_modules=target_modules, - vocab_size=self.vocab_size, - lora_config=self.lora_config, - lora_manager_cls=self._lora_manager_cls, - ) - self._lora_manager: LoRAModelManager = lora_manager - return lora_manager.model - - def set_active_loras(self, lora_requests: List[LoRARequest], - lora_mapping: LoRAMapping) -> None: - self._apply_loras(lora_requests) - self._lora_manager.set_lora_mapping(lora_mapping) - - def _apply_loras(self, lora_requests: List[LoRARequest]) -> None: - loras_that_exist = self.list_loras() - loras_map = { - lora_request.lora_int_id: lora_request - for lora_request in lora_requests if lora_request - } - if len(loras_map) > self._lora_manager.lora_slots: - raise RuntimeError( - f"Number of requested LoRAs ({len(loras_map)}) is greater " - "than the number of GPU LoRA slots " - f"({self._lora_manager.lora_slots}).") - - new_loras = set(loras_map) - loras_to_add = new_loras - loras_that_exist - loras_to_remove = loras_that_exist - new_loras - - for lora_id in loras_to_remove: - self.remove_lora(lora_id) - - for lora_id in loras_to_add: - self.add_lora(loras_map[lora_id]) - - def _load_lora(self, lora_request: LoRARequest) -> LoRAModel: - try: - lora = self._lora_model_cls.from_local_checkpoint( - lora_request.lora_local_path, - lora_model_id=lora_request.lora_int_id, - device="cpu", - dtype=self.lora_config.lora_dtype, - target_embedding_padding=self.vocab_size + - self.lora_config.lora_extra_vocab_size, - ) - except Exception as e: - raise RuntimeError( - f"Loading lora {lora_request.lora_local_path} failed") from e - if lora.rank > self.lora_config.max_lora_rank: - raise ValueError( - f"LoRA rank {lora.rank} is greater than max_lora_rank " - f"{self.lora_config.max_lora_rank}.") - if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: - raise ValueError( - f"LoRA added vocab size {lora.extra_vocab_size} is greater than " - f"lora_extra_vocab_size {self.lora_config.lora_extra_vocab_size}." 
- ) - return lora - - def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: - if lora_request.lora_int_id in self.list_loras(): - return False - return self._lora_manager.add_lora( - self._lora_manager.create_dummy_lora(lora_request.lora_int_id, - rank)) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if lora_request.lora_int_id in self.list_loras(): - return False - lora = self._load_lora(lora_request) - loaded = self._lora_manager.add_lora(lora) - self._lora_manager.activate_lora(lora.id) - return loaded - - def remove_lora(self, lora_id: int) -> bool: - return self._lora_manager.remove_lora(lora_id) - - def remove_all_loras(self) -> bool: - self._lora_manager.remove_all_loras() - - def list_loras(self) -> Set[int]: - return set(self._lora_manager.list_loras()) - - -class LRUCacheWorkerLoRAManager(WorkerLoRAManager): - """WorkerLoRAManager that manages LoRA models on the worker side. - - Uses an LRU Cache. Every request, the requested LoRAs will be loaded - (unless they are already loaded) and least recently used LoRAs will - be unloaded if the cache is above capacity.""" - - _lora_manager_cls: Type[ - LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager - - def create_lora_manager( - self, - model: torch.nn.Module, - target_modules: Union[str, List[str]] = TARGET_MODULES_QKV, - ) -> Any: - lora_manager = create_lora_manager( - model, - target_modules=target_modules, - lora_manager_cls=self._lora_manager_cls, - max_num_seqs=self.max_num_seqs, - vocab_size=self.vocab_size, - lora_config=self.lora_config, - max_num_batched_tokens=self.max_num_batched_tokens, - ) - self._lora_manager: LRUCacheLoRAModelManager = lora_manager - return lora_manager.model - - def _apply_loras(self, lora_requests: List[LoRARequest]) -> None: - loras_map = { - lora_request.lora_int_id: lora_request - for lora_request in lora_requests if lora_request - } - if len(loras_map) > self._lora_manager.lora_slots: - raise RuntimeError( - f"Number of requested LoRAs ({len(loras_map)}) is greater " - "than the number of GPU LoRA slots " - f"({self._lora_manager.lora_slots}).") - for lora in loras_map.values(): - self.add_lora(lora) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if lora_request.lora_int_id not in self.list_loras(): - # Remove before we load the new lora to save memory - if len(self._lora_manager) + 1 > self._lora_manager.capacity: - self._lora_manager.remove_oldest_lora() - lora = self._load_lora(lora_request) - loaded = self._lora_manager.add_lora(lora) - else: - # If the lora is already loaded, just touch it to - # update its position in the caches - loaded = self._lora_manager.get_lora(lora_request.lora_int_id) - self._lora_manager.activate_lora(lora_request.lora_int_id) - return loaded diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py deleted file mode 100644 index 0d5b2004ad7cb3668406b4d791e9dd519a523800..0000000000000000000000000000000000000000 --- a/vllm/model_executor/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.model_loader import get_model -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_random_seed - -__all__ = [ - "InputMetadata", - "get_model", - "SamplingMetadata", - "set_random_seed", -] diff --git a/vllm/model_executor/input_metadata.py b/vllm/model_executor/input_metadata.py deleted file mode 100644 index 
f0a88ac8e27f89569d1a344ed289cdf4cd8548d3..0000000000000000000000000000000000000000 --- a/vllm/model_executor/input_metadata.py +++ /dev/null @@ -1,54 +0,0 @@ -from typing import Optional - -import torch - - -class InputMetadata: - """Metadata for input sequences. Used in PagedAttention. - - Args: - prompt_lens: Lengths of prompts. - slot_mapping: The address to write the new KV to of each token. - max_context_len: The maximum context length. - context_lens: the length of attention context for each sequence. - block_tables: The block tables. (Seq id -> list of physical block) - kv_cache_dtype: Data type to store kv cache. - """ - - def __init__( - self, - is_prompt: bool, - slot_mapping: torch.Tensor, - prompt_lens: Optional[torch.Tensor], - max_seq_len: Optional[int], - start_loc: Optional[torch.Tensor], - max_context_len: Optional[int], - context_lens: Optional[torch.Tensor], - block_tables: Optional[torch.Tensor], - use_cuda_graph: bool, - kv_cache_dtype: str, - ) -> None: - self.is_prompt = is_prompt - self.prompt_lens = prompt_lens - self.max_seq_len = max_seq_len - self.start_loc = start_loc - self.max_context_len = max_context_len - self.slot_mapping = slot_mapping - self.context_lens = context_lens - self.block_tables = block_tables - self.use_cuda_graph = use_cuda_graph - self.kv_cache_dtype = kv_cache_dtype - - # Set during the execution of the first attention op. - # FIXME(woosuk): This is a hack. - self.attn_bias = None - - def __repr__(self) -> str: - return ("InputMetadata(" - f"is_prompt={self.is_prompt}, " - f"max_context_len={self.max_context_len}, " - f"slot_mapping={self.slot_mapping}, " - f"context_lens={self.context_lens}, " - f"block_tables={self.block_tables}, " - f"use_cuda_graph={self.use_cuda_graph}, " - f"kv_cache_dtype={self.kv_cache_dtype})") diff --git a/vllm/model_executor/layers/__init__.py b/vllm/model_executor/layers/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py deleted file mode 100644 index 1af120d13cd4b7e86bbdd45f3c2c9e3f0ae92e4f..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/activation.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Custom activation functions.""" -import math -from typing import Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from vllm._C import ops -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.parallel_utils.utils import divide -from vllm.model_executor.utils import set_weight_attrs - - -class SiluAndMul(nn.Module): - """An activation function for SwiGLU. - - The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
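A one-liner is enough to illustrate the SwiGLU identity the docstring states; this is the same computation as the PyTorch-native `_forward` below, not the custom `silu_and_mul` op.

```python
import torch
import torch.nn.functional as F

x = torch.randn(3, 10)                      # last dimension is 2 * d
d = x.shape[-1] // 2
out = F.silu(x[..., :d]) * x[..., d:]       # SwiGLU: silu(gate) * up
print(out.shape)                            # torch.Size([3, 5])
```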
- - Shapes: - x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) - return: (batch_size, seq_len, d) or (num_tokens, d) - """ - - def _forward(self, x: torch.Tensor) -> torch.Tensor: - """PyTorch-native implementation equivalent to forward().""" - d = x.shape[-1] // 2 - return F.silu(x[..., :d]) * x[..., d:] - - def forward(self, x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - output_shape = (x.shape[:-1] + (d, )) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.silu_and_mul(out, x) - return out - - -class NewGELU(nn.Module): - - def _forward(self, x: torch.Tensor) -> torch.Tensor: - """PyTorch-native implementation equivalent to forward().""" - c = math.sqrt(2.0 / math.pi) - return 0.5 * x * (1.0 + torch.tanh(c * - (x + 0.044715 * torch.pow(x, 3.0)))) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = torch.empty_like(x) - ops.gelu_new(out, x) - return out - - -class FastGELU(nn.Module): - - def _forward(self, x: torch.Tensor) -> torch.Tensor: - """PyTorch-native implementation equivalent to forward().""" - return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * - (1.0 + 0.044715 * x * x))) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = torch.empty_like(x) - ops.gelu_fast(out, x) - return out - - -class ScaledActivation(nn.Module): - """An activation function with post-scale parameters. - - This is used for some quantization methods like AWQ. - """ - - def __init__( - self, - act_module: nn.Module, - intermediate_size: int, - input_is_parallel: bool = True, - params_dtype: Optional[torch.dtype] = None, - ): - super().__init__() - self.act = act_module - self.input_is_parallel = input_is_parallel - if input_is_parallel: - tp_size = get_tensor_model_parallel_world_size() - intermediate_size_per_partition = divide(intermediate_size, - tp_size) - else: - intermediate_size_per_partition = intermediate_size - if params_dtype is None: - params_dtype = torch.get_default_dtype() - self.scales = nn.Parameter( - torch.empty(intermediate_size_per_partition, - dtype=params_dtype, - device="cuda")) - set_weight_attrs(self.scales, {"weight_loader": self.weight_loader}) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.act(x) / self.scales - - def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): - param_data = param.data - if self.input_is_parallel: - tp_rank = get_tensor_model_parallel_rank() - shard_size = param_data.shape[0] - start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(0, start_idx, shard_size) - assert param_data.shape == loaded_weight.shape - param_data.copy_(loaded_weight) - - -_ACTIVATION_REGISTRY = { - "gelu": nn.GELU(), - "gelu_fast": FastGELU(), - "gelu_new": NewGELU(), - "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), - "relu": nn.ReLU(), -} - - -def get_act_fn( - act_fn_name: str, - quant_config: Optional[QuantizationConfig] = None, - intermediate_size: Optional[int] = None, - input_is_parallel: bool = True, - params_dtype: Optional[torch.dtype] = None, -) -> nn.Module: - """Get an activation function by name.""" - act_fn_name = act_fn_name.lower() - if act_fn_name not in _ACTIVATION_REGISTRY: - raise ValueError( - f"Activation function {act_fn_name!r} is not supported.") - - act_fn = _ACTIVATION_REGISTRY[act_fn_name] - if (quant_config is not None - and act_fn_name in quant_config.get_scaled_act_names()): - if intermediate_size is None: - raise ValueError("intermediate_size must be specified for scaled " - "activation functions.") - return 
ScaledActivation(act_fn, intermediate_size, input_is_parallel, - params_dtype) - return act_fn diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py deleted file mode 100644 index 91ed43f07c76ef78b4bf79b7f7ba80ccf3cd92d7..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/attention.py +++ /dev/null @@ -1,302 +0,0 @@ -"""Multi-head attention.""" -from typing import List, Optional - -import torch -import torch.nn as nn -from xformers import ops as xops -from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, - LowerTriangularMaskWithTensorBias) - -from vllm._C import ops -from vllm._C import cache_ops -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.triton_kernel.prefix_prefill import ( - context_attention_fwd) -from vllm.utils import is_hip - -_SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] -# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. -_PARTITION_SIZE = 512 - - -class PagedAttention(nn.Module): - """MHA/MQA/GQA layer with PagedAttention. - - This class takes query, key, and value tensors as input. The input tensors - can either contain prompt tokens or generation tokens. - The class does the following: - - 1. Reshape and store the input key and value tensors in the KV cache. - 2. Perform (multi-head/multi-query/grouped-query) attention using either - xformers or the PagedAttention custom op. - 3. Return the output tensor. - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: Optional[int] = None, - alibi_slopes: Optional[List[float]] = None, - sliding_window: Optional[int] = None, - ) -> None: - super().__init__() - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads - self.sliding_window = sliding_window - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) - - assert self.num_heads % self.num_kv_heads == 0 - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - if self.head_size not in _SUPPORTED_HEAD_SIZES: - raise ValueError(f"head_size ({self.head_size}) is not supported. " - f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.") - - def forward( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - key_cache: Optional[torch.Tensor], - value_cache: Optional[torch.Tensor], - input_metadata: InputMetadata, - ) -> torch.Tensor: - """PagedAttention forward pass. - - Args: - query: shape = [batch_size, seq_len, num_heads * head_size] - key: shape = [batch_size, seq_len, num_kv_heads * head_size] - value: shape = [batch_size, seq_len, num_kv_heads * head_size] - key_cache: shape = [num_blocks, num_kv_heads, head_size/x, - block_size, x] - value_cache: shape = [num_blocks, num_kv_heads, head_size, - block_size] - input_metadata: metadata for the inputs. - Returns: - shape = [batch_size, seq_len, num_heads * head_size] - """ - batch_size, seq_len, hidden_size = query.shape - # Reshape the query, key, and value tensors. - query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - # Reshape the keys and values and store them in the cache. - # If key_cache and value_cache are not provided, the new key and value - # vectors will not be cached. 
This happens during the initial memory - # profiling run. - if key_cache is not None and value_cache is not None: - cache_ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - input_metadata.slot_mapping.flatten(), - input_metadata.kv_cache_dtype, - ) - - if input_metadata.is_prompt: - # Prompt run. - if self.num_kv_heads != self.num_heads: - # As of Nov 2023, xformers only supports MHA. For MQA/GQA, - # project the key and value tensors to the desired number of - # heads. - # TODO(woosuk): Use MQA/GQA kernels for higher performance. - query = query.view(query.shape[0], self.num_kv_heads, - self.num_queries_per_kv, query.shape[-1]) - key = key[:, :, - None, :].expand(key.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - key.shape[-1]) - value = value[:, :, None, :].expand(value.shape[0], - self.num_kv_heads, - self.num_queries_per_kv, - value.shape[-1]) - # normal attention - if (key_cache is None or value_cache is None - or input_metadata.block_tables.numel() == 0): - # Set attention bias if not provided. This typically happens at - # the very attention layer of every iteration. - # FIXME(woosuk): This is a hack. - if input_metadata.attn_bias is None: - if self.alibi_slopes is None: - attn_bias = BlockDiagonalCausalMask.from_seqlens( - [seq_len] * batch_size) - if self.sliding_window is not None: - attn_bias = attn_bias.make_local_attention( - self.sliding_window) - input_metadata.attn_bias = attn_bias - else: - input_metadata.attn_bias = _make_alibi_bias( - self.alibi_slopes, self.num_kv_heads, batch_size, - seq_len, query.dtype) - - # TODO(woosuk): Too many view operations. Let's try to reduce - # them in the future for code readability. - if self.alibi_slopes is None: - query = query.unsqueeze(0) - key = key.unsqueeze(0) - value = value.unsqueeze(0) - else: - query = query.unflatten(0, (batch_size, seq_len)) - key = key.unflatten(0, (batch_size, seq_len)) - value = value.unflatten(0, (batch_size, seq_len)) - - out = xops.memory_efficient_attention_forward( - query, - key, - value, - attn_bias=input_metadata.attn_bias, - p=0.0, - scale=self.scale, - op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if - (is_hip()) else None, - ) - output = out.view_as(query) - else: - # prefix-enabled attention - output = torch.empty_like(query) - context_attention_fwd( - query, - key, - value, - output, - key_cache, - value_cache, - input_metadata.block_tables, # [BS, max_block_per_request] - input_metadata.start_loc, - input_metadata.prompt_lens, - input_metadata.context_lens, - input_metadata.max_seq_len, - getattr(self, "alibi_slopes", None), - ) - - else: - # Decoding run. - output = _paged_attention( - query, - key_cache, - value_cache, - input_metadata, - self.num_kv_heads, - self.scale, - self.alibi_slopes, - ) - - # Reshape the output tensor. - return output.view(batch_size, seq_len, hidden_size) - - -def _make_alibi_bias( - alibi_slopes: torch.Tensor, - num_kv_heads: int, - batch_size: int, - seq_len: int, - dtype: torch.dtype, -) -> LowerTriangularMaskWithTensorBias: - bias = torch.arange(seq_len, dtype=dtype, device="cuda") - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(prompt_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. - bias = bias[None, :] - bias[:, None] - - # When using custom attention bias, xformers requires the bias to - # be sliced from a tensor whose length is a multiple of 8. 
- padded_len = (seq_len + 7) // 8 * 8 - num_heads = alibi_slopes.shape[0] - bias = torch.empty( - batch_size, - num_heads, - seq_len, - padded_len, - device=alibi_slopes.device, - dtype=dtype, - )[:, :, :, :seq_len].copy_(bias) - bias.mul_(alibi_slopes[:, None, None]) - if num_heads != num_kv_heads: - bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) - attn_bias = LowerTriangularMaskWithTensorBias(bias) - return attn_bias - - -def _paged_attention( - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - input_metadata: InputMetadata, - num_kv_heads: int, - scale: float, - alibi_slopes: Optional[torch.Tensor], -) -> torch.Tensor: - output = torch.empty_like(query) - - block_size = value_cache.shape[3] - num_seqs, num_heads, head_size = query.shape - max_num_partitions = ( - (input_metadata.max_context_len + _PARTITION_SIZE - 1) // - _PARTITION_SIZE) - # NOTE(woosuk): We use a simple heuristic to decide whether to use - # PagedAttention V1 or V2. If the number of partitions is 1, we use - # V1 to avoid the overhead of reduction. Also, if the number of - # sequences or heads is large, we use V1 since there is enough work - # to parallelize. - # TODO(woosuk): Tune this heuristic. - # For context len > 8192, use V2 kernel to avoid shared memory shortage. - use_v1 = input_metadata.max_context_len <= 8192 and ( - max_num_partitions == 1 or num_seqs * num_heads > 512) - if use_v1: - # Run PagedAttention V1. - ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - input_metadata.block_tables, - input_metadata.context_lens, - block_size, - input_metadata.max_context_len, - alibi_slopes, - input_metadata.kv_cache_dtype, - ) - else: - # Run PagedAttention V2. - assert _PARTITION_SIZE % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - input_metadata.block_tables, - input_metadata.context_lens, - block_size, - input_metadata.max_context_len, - alibi_slopes, - input_metadata.kv_cache_dtype, - ) - return output diff --git a/vllm/model_executor/layers/fused_moe.py b/vllm/model_executor/layers/fused_moe.py deleted file mode 100644 index 998062d82d1f0099803c7cb4abbaf631c0aab22c..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/fused_moe.py +++ /dev/null @@ -1,287 +0,0 @@ -"""Fused MoE kernel.""" -import torch -import triton -import triton.language as tl - -from vllm._C import ops - - -@triton.jit -def fused_moe_kernel( - # Pointers to matrices - a_ptr, - b_ptr, - c_ptr, - topk_weights_ptr, - sorted_token_ids_ptr, - expert_ids_ptr, - num_tokens_post_padded_ptr, - # Matrix dimensions - N, - K, - EM, - num_valid_tokens, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` - # by to get the element one row down (A has M rows). 
- stride_am, - stride_ak, - stride_be, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, - MUL_ROUTED_WEIGHT: tl.constexpr, - top_k: tl.constexpr, - compute_type: tl.constexpr, -): - """ - Implements the fused computation for a Mixture of Experts (MOE) using token and expert matrices. - - Key Parameters: - - A: The input tensor representing tokens with shape (*, K), where '*' can be any shape representing batches and K is the feature dimension of each token. - - B: The stacked MOE weight tensor with shape (E, N, K), where E is the number of experts, K is the input feature dimension, and N is the output feature dimension. - - C: The output cache tensor with shape (M, topk, N), where M is the total number of tokens post padding, topk is the number of times each token is repeated, - and N is the output feature dimension. - - sorted_token_ids: A tensor containing the sorted indices of tokens, repeated topk times and arranged by the expert index they are assigned to. - - expert_ids: A tensor containing the indices of the expert for each block. It determines which expert matrix from B should be used for each block in A. - This kernel performs the multiplication of a token by its corresponding expert matrix as determined by `expert_ids`. The sorting of `sorted_token_ids` - by expert index and padding ensures divisibility by BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix multiplication across different blocks processed by the same expert. - """ - # ----------------------------------------------------------- - # Map program ids `pid` to the block of C it should compute. - # This is done in a grouped ordering to promote L2 data reuse. - pid = tl.program_id(axis=0) - num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - # ---------------------------------------------------------- - # Create pointers for the first blocks of A and B. - # We will advance this pointer as we move in the K direction - # and accumulate - # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers - # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers - num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) - if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: - return - offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) - token_mask = offs_token < num_valid_tokens - - offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + - offs_k[None, :] * stride_ak) - - off_experts = tl.load(expert_ids_ptr + pid_m) - b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + - offs_bn[None, :] * stride_bn) - - # ----------------------------------------------------------- - # Iterate to compute a block of the C matrix. - # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block - # of fp32 values for higher accuracy. - # `accumulator` will be converted back to fp16 after the loop. 
- accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - - for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): - # Load the next block of A and B, generate a mask by checking the K dimension. - a = tl.load(a_ptrs, - mask=token_mask[:, None] & - (offs_k[None, :] < K - k * BLOCK_SIZE_K), - other=0.0) - b = tl.load(b_ptrs, - mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, - other=0.0) - # We accumulate along the K dimension. - accumulator += tl.dot(a, b) - # Advance the ptrs to the next K block. - a_ptrs += BLOCK_SIZE_K * stride_ak - b_ptrs += BLOCK_SIZE_K * stride_bk - - if MUL_ROUTED_WEIGHT: - moe_weight = tl.load(topk_weights_ptr + offs_token, - mask=token_mask, - other=0) - accumulator = accumulator * moe_weight[:, None] - - accumulator = accumulator.to(compute_type) - # ----------------------------------------------------------- - # Write back the block of the output - offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ - None, :] - c_mask = token_mask[:, None] & (offs_cn[None, :] < N) - tl.store(c_ptrs, accumulator, mask=c_mask) - - -def moe_align_block_size( - topk_ids: torch.Tensor, block_size: int, - num_experts: int) -> (torch.Tensor, torch.Tensor, torch.Tensor): - """ - Aligns the token distribution across experts to be compatible with block size for matrix multiplication. - - Parameters: - - topk_ids: A tensor of shape [total_tokens, top_k] representing the top-k expert indices for each token. - - block_size: The block size used in block matrix multiplication. - - num_experts: The total number of experts. - - Returns: - - sorted_token_ids: A tensor containing the sorted token indices according to their allocated expert. - - expert_ids: A tensor indicating the assigned expert index for each block. - - num_tokens_post_padded: The total number of tokens after padding, ensuring divisibility by block_size. - - This function pads the number of tokens that each expert needs to process so that it is divisible by block_size. - Padding ensures that during block matrix multiplication, the dimensions align correctly. - - Example: - Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], block_size = 4, and num_experts = 4: - - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, with each expert needing to process 3 tokens. - - As block_size is 4, we pad 1 token for each expert. - - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3]. - - Then append padding tokens [12, 12, 12, 12] for each block. - - After sorting by expert index, we obtain token_ids [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. - Tokens 12 are non-existent (padding) and are ignored in the subsequent matrix multiplication. - - The padding ensures that the total number of tokens is now divisible by block_size for proper block matrix operations. 
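The worked example in the docstring above can be reproduced with a small PyTorch-native sketch. This is a simplified reference (it groups tokens by whichever expert ids actually occur and ignores some details of the real `moe_align_block_size` op, such as emitting entries for unused experts); it is only meant to make the sorting-and-padding scheme concrete.

```python
import torch

def moe_align_ref(topk_ids: torch.Tensor, block_size: int):
    """Group flattened token indices by expert, padding each group to block_size."""
    flat = topk_ids.flatten()
    pad_id = flat.numel()                                 # sentinel id for padding tokens
    sorted_token_ids, expert_ids = [], []
    for expert in torch.unique(flat).tolist():            # experts in ascending order
        token_idx = (flat == expert).nonzero(as_tuple=True)[0].tolist()
        token_idx += [pad_id] * ((-len(token_idx)) % block_size)
        sorted_token_ids += token_idx
        expert_ids += [expert] * (len(token_idx) // block_size)   # one expert per block
    return sorted_token_ids, expert_ids, len(sorted_token_ids)

topk_ids = torch.tensor([[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]])
ids, experts, num_post_pad = moe_align_ref(topk_ids, block_size=4)
print(ids)           # [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]
print(experts)       # [1, 2, 3, 4]
print(num_post_pad)  # 16
```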
- """ - sorted_ids = torch.empty( - (topk_ids.numel() + num_experts * (block_size - 1), ), - dtype=torch.int32, - device=topk_ids.device) - expert_ids = torch.empty((topk_ids.numel() + num_experts, ), - dtype=torch.int32, - device=topk_ids.device) - sorted_ids.fill_(topk_ids.numel()) - num_tokens_post_pad = torch.empty((1), - dtype=torch.int32, - device=topk_ids.device) - ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids, - expert_ids, num_tokens_post_pad) - return sorted_ids, expert_ids, num_tokens_post_pad - - -def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, - topk_weights: torch.Tensor, topk_ids: torch.Tensor, - sorted_token_ids: torch.Tensor, - expert_ids: torch.Tensor, - num_tokens_post_padded: torch.Tensor, - mul_routed_weight: bool, top_k: int, config: dict): - - assert topk_weights.stride(1) == 1 - assert sorted_token_ids.stride(0) == 1 - - grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[ - 'BLOCK_SIZE_M']) * triton.cdiv(B.shape[1], META['BLOCK_SIZE_N']), ) - - fused_moe_kernel[grid]( - A, - B, - C, - topk_weights, - sorted_token_ids, - expert_ids, - num_tokens_post_padded, - B.shape[1], - B.shape[2], - sorted_token_ids.shape[0], - topk_ids.numel(), - A.stride(0), - A.stride(1), - B.stride(0), - B.stride(2), - B.stride(1), - C.stride(1), - C.stride(2), - MUL_ROUTED_WEIGHT=mul_routed_weight, - top_k=top_k, - compute_type=tl.bfloat16 if A.dtype == torch.bfloat16 else tl.float16, - **config, - ) - - -def fused_moe(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - inplace=False): - """ - This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. - - Parameters: - - hidden_states (torch.Tensor): The input tensor to the MoE layer. - - w1 (torch.Tensor): The first set of expert weights. - - w2 (torch.Tensor): The second set of expert weights. - - topk_weights (torch.Tensor): The weights for the top-k selected experts. - - topk_ids (torch.Tensor): The indices of the top-k selected experts. - - inplace (bool): If True, perform the operation in-place. Defaults to False. - - Returns: - - torch.Tensor: The output tensor after applying the MoE layer. - """ - # Check constraints. 
- assert hidden_states.shape[1] == w1.shape[2], "Incompatible dimensions" - assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" - assert w1.is_contiguous(), "Expert weights1 must be contiguous" - assert w2.is_contiguous(), "Expert weights2 must be contiguous" - assert hidden_states.dtype in [torch.float16, torch.bfloat16] - M, _ = hidden_states.shape - E, N, _ = w1.shape - - config = { - 'BLOCK_SIZE_M': 64, - 'BLOCK_SIZE_N': 64, - 'BLOCK_SIZE_K': 32, - 'GROUP_SIZE_M': 8 - } - - if topk_ids.numel() <= w1.shape[0]: - config = { - 'BLOCK_SIZE_M': 16, - 'BLOCK_SIZE_N': 32, - 'BLOCK_SIZE_K': 64, - 'GROUP_SIZE_M': 1 - } - - intermediate_cache1 = torch.empty((M, topk_ids.shape[1], N), - device=hidden_states.device, - dtype=hidden_states.dtype) - intermediate_cache2 = torch.empty((M * topk_ids.shape[1], N // 2), - device=hidden_states.device, - dtype=hidden_states.dtype) - intermediate_cache3 = torch.empty((M, topk_ids.shape[1], w2.shape[1]), - device=hidden_states.device, - dtype=hidden_states.dtype) - - sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size( - topk_ids, config['BLOCK_SIZE_M'], E) - - invoke_fused_moe_kernel(hidden_states, w1, intermediate_cache1, - topk_weights, topk_ids, sorted_token_ids, - expert_ids, num_tokens_post_padded, False, - topk_ids.shape[1], config) - - ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N)) - - invoke_fused_moe_kernel(intermediate_cache2, w2, intermediate_cache3, - topk_weights, topk_ids, sorted_token_ids, - expert_ids, num_tokens_post_padded, True, 1, - config) - - if inplace: - return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), - dim=1, - out=hidden_states) - return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), - dim=1) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py deleted file mode 100644 index cb3cee2bad5ad3a8c06838626469c465c0f60192..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/layernorm.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Custom normalization layers.""" -from typing import Optional, Tuple, Union - -import torch -import torch.nn as nn - -from vllm._C import ops - - -class RMSNorm(nn.Module): - """Root mean square normalization. - - Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight. 
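The formula above, together with the fused residual-add variant that `forward()` applies when a `residual` tensor is passed, can be spelled out in a few lines of plain PyTorch. Shapes are arbitrary, and this mirrors the `_forward` reference below rather than the custom ops.

```python
import torch

x = torch.randn(2, 8)
residual = torch.randn(2, 8)
weight = torch.randn(8)
eps = 1e-6

h = x + residual                       # fused_add_rms_norm adds the residual first
rms = torch.sqrt(h.pow(2).mean(dim=-1, keepdim=True) + eps)
normed = weight * h / rms              # w * x / sqrt(E[x^2] + eps)
# forward(x, residual) returns (normed, h): the output plus the updated residual stream.
```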
- Refer to https://arxiv.org/abs/1910.07467 - """ - - def __init__( - self, - hidden_size: int, - eps: float = 1e-6, - ) -> None: - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def _forward( - self, - x: torch.Tensor, - residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - """PyTorch-native implementation equivalent to forward().""" - orig_dtype = x.dtype - x = x.to(torch.float32) - if residual is not None: - x = x + residual.to(torch.float32) - residual = x.to(orig_dtype) - - variance = x.pow(2).mean(dim=-1, keepdim=True) - x = x * torch.rsqrt(variance + self.variance_epsilon) - x = x.to(orig_dtype) * self.weight - if residual is None: - return x - else: - return x, residual - - def forward( - self, - x: torch.Tensor, - residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - if residual is not None: - ops.fused_add_rms_norm( - x, - residual, - self.weight.data, - self.variance_epsilon, - ) - return x, residual - out = torch.empty_like(x) - ops.rms_norm( - out, - x, - self.weight.data, - self.variance_epsilon, - ) - return out diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py deleted file mode 100644 index 5e1d63a6a62eb7c4c0ad9526e5fef87ea7ed1682..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/linear.py +++ /dev/null @@ -1,558 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional - -import torch -import torch.nn.functional as F -from torch.nn.parameter import Parameter - -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather) -from vllm.model_executor.parallel_utils.utils import ( - divide, split_tensor_along_last_dim) -from vllm.model_executor.utils import set_weight_attrs -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class LinearMethodBase(ABC): - """Base class for different (maybe quantized) linear methods.""" - - @abstractmethod - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: - """Create weights for a linear layer.""" - raise NotImplementedError - - @abstractmethod - def apply_weights(self, - weights: Dict[str, torch.Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - """Apply the weights to the input tensor.""" - raise NotImplementedError - - -class UnquantizedLinearMethod(LinearMethodBase): - """Linear method without quantization. - - Args: - separate_bias_add: If true, add bias separately after matrix - multiplication. 
- """ - - def __init__(self, separate_bias_add: bool = False): - self.separate_bias_add = separate_bias_add - - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: - weight = Parameter(torch.empty(output_size_per_partition, - input_size_per_partition, - device=torch.cuda.current_device(), - dtype=params_dtype), - requires_grad=False) - set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) - return {"weight": weight} - - def apply_weights(self, - weights: Dict[str, torch.Tensor], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - weight = weights["weight"] - if self.separate_bias_add: - if bias: - return F.linear(x, weight) + bias - return F.linear(x, weight) - return F.linear(x, weight, bias) - - -class ReplicatedLinear(torch.nn.Module): - """Replicated linear layer. - - Args: - input_size: input dimension of the linear layer. - output_size: output dimension of the linear layer. - bias: If true, add bias. - skip_bias_add: If true, skip adding bias but instead return it. - params_dtype: Data type for the parameters. - linear_method: (Maybe quantized) linear method. - """ - - def __init__( - self, - input_size: int, - output_size: int, - bias: bool = True, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - - # Keep input parameters - self.input_size = input_size - self.output_size = output_size - self.skip_bias_add = skip_bias_add - if params_dtype is None: - params_dtype = torch.get_default_dtype() - self.params_dtype = params_dtype - if linear_method is None: - linear_method = UnquantizedLinearMethod() - self.linear_method = linear_method - self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size, self.input_size, - self.output_size, self.params_dtype) - for name, weight in self.linear_weights.items(): - if isinstance(weight, torch.Tensor): - self.register_parameter(name, weight) - if bias: - self.bias = Parameter( - torch.empty(self.output_size, - device=torch.cuda.current_device(), - dtype=self.params_dtype)) - set_weight_attrs(self.bias, {"output_dim": 0}) - else: - self.register_parameter("bias", None) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - bias = self.bias if not self.skip_bias_add else None - output = self.linear_method.apply_weights(self.linear_weights, x, bias) - output_bias = self.bias if self.skip_bias_add else None - return output, output_bias - - -class ColumnParallelLinear(torch.nn.Module): - """Linear layer with column parallelism. - - The linear layer is defined as Y = XA + b. A is parallelized along - its second dimension as A = [A_1, ..., A_p]. - - Args: - input_size: first dimension of matrix A. - output_size: second dimension of matrix A. - bias: If true, add bias. - gather_output: If true, call all-gather on output and make Y available - to all GPUs, otherwise, every GPU will have its output - which is Y_i = XA_i - skip_bias_add: This was added to enable performance optimizations where - bias can be fused with other element-wise operations. we - skip adding bias but instead return it. - params_dtype: Data type for the parameters. - linear_method: (Maybe quantized) linear method. 
- """ - - def __init__( - self, - input_size: int, - output_size: int, - bias: bool = True, - gather_output: bool = False, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - - # Keep input parameters - self.input_size = input_size - self.output_size = output_size - self.gather_output = gather_output - # Divide the weight matrix along the last dimension. - tp_size = get_tensor_model_parallel_world_size() - self.output_size_per_partition = divide(output_size, tp_size) - self.skip_bias_add = skip_bias_add - if params_dtype is None: - params_dtype = torch.get_default_dtype() - self.params_dtype = params_dtype - if linear_method is None: - linear_method = UnquantizedLinearMethod() - self.linear_method = linear_method - self.linear_weights = self.linear_method.create_weights( - self.input_size, self.output_size_per_partition, self.input_size, - self.output_size, self.params_dtype) - for name, weight in self.linear_weights.items(): - if isinstance(weight, torch.Tensor): - self.register_parameter(name, weight) - set_weight_attrs(weight, {"weight_loader": self.weight_loader}) - if bias: - self.bias = Parameter( - torch.empty(self.output_size_per_partition, - device=torch.cuda.current_device(), - dtype=params_dtype)) - set_weight_attrs(self.bias, { - "output_dim": 0, - "weight_loader": self.weight_loader, - }) - else: - self.register_parameter("bias", None) - - def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): - tp_rank = get_tensor_model_parallel_rank() - output_dim = getattr(param, "output_dim", None) - param_data = param.data - if output_dim is not None: - shard_size = param_data.shape[output_dim] - start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) - assert param_data.shape == loaded_weight.shape - param_data.copy_(loaded_weight) - - def forward(self, input_): - bias = self.bias if not self.skip_bias_add else None - - # Matrix multiply. - output_parallel = self.linear_method.apply_weights( - self.linear_weights, input_, bias) - if self.gather_output: - # All-gather across the partitions. - output = tensor_model_parallel_all_gather(output_parallel) - else: - output = output_parallel - output_bias = self.bias if self.skip_bias_add else None - return output, output_bias - - -class MergedColumnParallelLinear(ColumnParallelLinear): - """Packed linear layers with column parallelism. - - Similar to ColumnParallelLinear, but the weight matrix is concatenated - along the output dimension. When the weight matrix is loaded, the - different partitions are sharded separately. - - Args: - input_size: input dimension of the linear layer. - output_sizes: list of output dimensions of the linear layer. - bias: If true, add bias. - gather_output: If true, call all-gather on output and make the output - available to all GPUs, otherwise, every GPU will have - its own output. - skip_bias_add: This was added to enable performance optimizations where - bias can be fused with other element-wise operations. we - skip adding bias but instead return it. - params_dtype: Data type for the parameters. - linear_method: (Maybe quantized) linear method. 
- """ - - def __init__( - self, - input_size: int, - output_sizes: List[int], - bias: bool = True, - gather_output: bool = False, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - linear_method: Optional[LinearMethodBase] = None, - ): - self.output_sizes = output_sizes - tp_size = get_tensor_model_parallel_world_size() - assert all(output_size % tp_size == 0 for output_size in output_sizes) - super().__init__(input_size, sum(output_sizes), bias, gather_output, - skip_bias_add, params_dtype, linear_method) - - def weight_loader(self, - param: Parameter, - loaded_weight: torch.Tensor, - loaded_shard_id: Optional[int] = None): - param_data = param.data - output_dim = getattr(param, "output_dim", None) - if loaded_shard_id is None: - # Loaded weight is already packed. - if output_dim is None: - assert param_data.shape == loaded_weight.shape - param_data.copy_(loaded_weight) - return - current_shard_offset = 0 - shard_offsets = [] - for i, output_size in enumerate(self.output_sizes): - shard_offsets.append((i, current_shard_offset, output_size)) - current_shard_offset += output_size - packed_dim = getattr(param, "packed_dim", None) - for shard_id, shard_offset, shard_size in shard_offsets: - # If quantized, we need to adjust the offset and size to account - # for the packing. - if packed_dim == output_dim: - shard_size = shard_size // param.pack_factor - shard_offset = shard_offset // param.pack_factor - loaded_weight_shard = loaded_weight.narrow( - output_dim, shard_offset, shard_size) - self.weight_loader(param, loaded_weight_shard, shard_id) - return - - assert loaded_shard_id < len(self.output_sizes) - tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() - if output_dim is not None: - shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size - shard_size = self.output_sizes[loaded_shard_id] // tp_size - # If quantized, we need to adjust the offset and size to account - # for the packing. - packed_dim = getattr(param, "packed_dim", None) - if packed_dim == output_dim: - shard_size = shard_size // param.pack_factor - shard_offset = shard_offset // param.pack_factor - param_data = param_data.narrow(output_dim, shard_offset, - shard_size) - start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) - else: - ignore_warning = getattr(param, "ignore_warning", False) - if not ignore_warning: - logger.warning( - "Loading a weight without `output_dim` attribute in " - "MergedColumnParallelLinear, assume the weight is " - "the same for all partitions.") - assert param_data.shape == loaded_weight.shape - param_data.copy_(loaded_weight) - - -class QKVParallelLinear(ColumnParallelLinear): - """Linear layers for the attention's QKV transformation. - - Linear layers for the linear transformation of the query, key, and value - vectors in the attention layer. The weight matrix is concatenated along - the output dimension. The layer is parallelized along the head dimension. - When the number of key/value heads is smaller than the number of query - heads (e.g., multi-query/grouped-query attention), the key/value head may - be replicated while the query heads are partitioned. - - Args: - hidden_size: input hidden state size of the transformer. - head_size: size of each attention head. - total_num_heads: total number of attention query heads. - total_num_kv_heads: total number of attention key/value heads. If - None, assume total_num_kv_heads = total_num_heads. 
- bias: If true, add bias. - skip_bias_add: This was added to enable performance optimizations where - bias can be fused with other element-wise operations. we - skip adding bias but instead return it. - params_dtype: Data type for the parameters. - linear_method: (Maybe quantized) linear method. - """ - - def __init__( - self, - hidden_size: int, - head_size: int, - total_num_heads: int, - total_num_kv_heads: Optional[int] = None, - bias: bool = True, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - linear_method: Optional[LinearMethodBase] = None, - ): - self.hidden_size = hidden_size - self.head_size = head_size - self.total_num_heads = total_num_heads - if total_num_kv_heads is None: - total_num_kv_heads = total_num_heads - self.total_num_kv_heads = total_num_kv_heads - # Divide the weight matrix along the last dimension. - tp_size = get_tensor_model_parallel_world_size() - self.num_heads = divide(self.total_num_heads, tp_size) - if tp_size >= self.total_num_kv_heads: - self.num_kv_heads = 1 - self.num_kv_head_replicas = divide(tp_size, - self.total_num_kv_heads) - else: - self.num_kv_heads = divide(self.total_num_kv_heads, tp_size) - self.num_kv_head_replicas = 1 - input_size = self.hidden_size - output_size = (self.num_heads + - 2 * self.num_kv_heads) * tp_size * self.head_size - super().__init__(input_size, output_size, bias, False, skip_bias_add, - params_dtype, linear_method) - - def weight_loader(self, - param: Parameter, - loaded_weight: torch.Tensor, - loaded_shard_id: Optional[str] = None): - param_data = param.data - output_dim = getattr(param, "output_dim", None) - if loaded_shard_id is None: - # Loaded weight is already packed. - if output_dim is None: - assert param_data.shape == loaded_weight.shape - param_data.copy_(loaded_weight) - return - shard_offsets = [ - # (shard_id, shard_offset, shard_size) - ("q", 0, self.total_num_heads * self.head_size), - ("k", self.total_num_heads * self.head_size, - self.total_num_kv_heads * self.head_size), - ("v", (self.total_num_heads + self.total_num_kv_heads) * - self.head_size, self.total_num_kv_heads * self.head_size), - ] - packed_dim = getattr(param, "packed_dim", None) - for shard_id, shard_offset, shard_size in shard_offsets: - # If quantized, we need to adjust the offset and size to account - # for the packing. - if packed_dim == output_dim: - shard_size = shard_size // param.pack_factor - shard_offset = shard_offset // param.pack_factor - loaded_weight_shard = loaded_weight.narrow( - output_dim, shard_offset, shard_size) - self.weight_loader(param, loaded_weight_shard, shard_id) - return - - tp_rank = get_tensor_model_parallel_rank() - assert loaded_shard_id in ["q", "k", "v"] - if output_dim is not None: - if loaded_shard_id == "q": - shard_offset = 0 - shard_size = self.num_heads * self.head_size - elif loaded_shard_id == "k": - shard_offset = self.num_heads * self.head_size - shard_size = self.num_kv_heads * self.head_size - elif loaded_shard_id == "v": - shard_offset = (self.num_heads + - self.num_kv_heads) * self.head_size - shard_size = self.num_kv_heads * self.head_size - # If quantized, we need to adjust the offset and size to account - # for the packing. 
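The shard bookkeeping in this loader is easier to follow with concrete numbers. The sketch below is illustrative only: it assumes a GQA layout with 32 query heads, 8 KV heads, head size 128, and the 4-bit packing (`pack_factor = 32 // 4 = 8`) used by the AWQ/GPTQ configs elsewhere in this diff, and it reproduces just the offset arithmetic, not the actual copying.

```python
# Illustrative numbers only (not from any particular checkpoint).
total_num_heads, total_num_kv_heads, head_size = 32, 8, 128
pack_factor = 32 // 4  # eight 4-bit codes per int32

# (shard_id, shard_offset, shard_size) into the fused QKV output dimension,
# mirroring the packed-weight branch above.
shard_offsets = [
    ("q", 0, total_num_heads * head_size),
    ("k", total_num_heads * head_size, total_num_kv_heads * head_size),
    ("v", (total_num_heads + total_num_kv_heads) * head_size,
     total_num_kv_heads * head_size),
]

# When the parameter is packed along the output dim, every pack_factor
# unpacked columns share one int32 column, so offsets and sizes divide evenly.
packed = [(name, offset // pack_factor, size // pack_factor)
          for name, offset, size in shard_offsets]
assert packed == [("q", 0, 512), ("k", 512, 128), ("v", 640, 128)]
```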
- packed_dim = getattr(param, "packed_dim", None) - if packed_dim == output_dim: - shard_size = shard_size // param.pack_factor - shard_offset = shard_offset // param.pack_factor - param_data = param_data.narrow(output_dim, shard_offset, - shard_size) - if loaded_shard_id == "q": - shard_id = tp_rank - else: - shard_id = tp_rank // self.num_kv_head_replicas - start_idx = shard_id * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, - shard_size) - else: - ignore_warning = getattr(param, "ignore_warning", False) - if not ignore_warning: - logger.warning( - "Loading a weight without `output_dim` attribute in " - "QKVParallelLinear, assume the weight is the same " - "for all partitions.") - assert param_data.shape == loaded_weight.shape - param_data.copy_(loaded_weight) - - -class RowParallelLinear(torch.nn.Module): - """Linear layer with row parallelism. - - The linear layer is defined as Y = XA + b. A is parallelized along - its first dimension and X along its second dimension as: - - - - | A_1 | - | . | - A = | . | X = [X_1, ..., X_p] - | . | - | A_p | - - - - Arguments: - input_size: first dimension of matrix A. - output_size: second dimension of matrix A. - bias: If true, add bias. Note that bias is not parallelized. - input_is_parallel: If true, we assume that the input is already - split across the GPUs and we do not split - again. - skip_bias_add: This was added to enable performance optimization where - bias can be fused with other element-wise operations. - We skip adding bias but instead return it. - params_dtype: Data type for the parameters. - linear_method: (Maybe quantized) linear method. - """ - - def __init__( - self, - input_size: int, - output_size: int, - bias: bool = True, - input_is_parallel: bool = True, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - reduce_results: bool = True, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - # Keep input parameters - self.input_size = input_size - self.output_size = output_size - self.input_is_parallel = input_is_parallel - self.reduce_results = reduce_results - if params_dtype is None: - params_dtype = torch.get_default_dtype() - self.params_dtype = params_dtype - - # Divide the weight matrix along the last dimension. 
- self.tp_size = get_tensor_model_parallel_world_size() - self.input_size_per_partition = divide(input_size, self.tp_size) - self.skip_bias_add = skip_bias_add - if linear_method is None: - linear_method = UnquantizedLinearMethod() - self.linear_method = linear_method - self.linear_weights = self.linear_method.create_weights( - self.input_size_per_partition, self.output_size, self.input_size, - self.output_size, self.params_dtype) - for name, weight in self.linear_weights.items(): - if isinstance(weight, torch.Tensor): - self.register_parameter(name, weight) - set_weight_attrs(weight, {"weight_loader": self.weight_loader}) - - if not reduce_results and (bias and not skip_bias_add): - raise ValueError("When not reducing the results, adding bias to the " - "results can lead to incorrect results") - - if bias: - self.bias = Parameter( - torch.empty(self.output_size, - device=torch.cuda.current_device(), - dtype=params_dtype)) - set_weight_attrs(self.bias, { - "output_dim": 0, - "weight_loader": self.weight_loader, - }) - else: - self.register_parameter("bias", None) - - def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): - tp_rank = get_tensor_model_parallel_rank() - input_dim = getattr(param, "input_dim", None) - param_data = param.data - if input_dim is not None: - shard_size = param_data.shape[input_dim] - start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(input_dim, start_idx, - shard_size) - assert param_data.shape == loaded_weight.shape - param_data.copy_(loaded_weight) - - def forward(self, input_): - # Set up backprop all-reduce. - if self.input_is_parallel: - input_parallel = input_ - else: - tp_rank = get_tensor_model_parallel_rank() - splitted_input = split_tensor_along_last_dim( - input_, num_partitions=self.tp_size) - input_parallel = splitted_input[tp_rank].contiguous() - - # Matrix multiply.
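At this point the row-parallel layer multiplies the local input shard by the local weight shard and, when `reduce_results` is true, all-reduces the partial products across ranks. The single-process sketch below simulates the ranks with a plain loop instead of a real `tensor_model_parallel_all_reduce`; the tensor sizes are arbitrary. It only illustrates why the per-rank partial products sum to the unsharded result.

```python
import torch

torch.manual_seed(0)
tp_size = 4
x = torch.randn(2, 16)   # [batch, input_size]
a = torch.randn(16, 8)   # [input_size, output_size]

x_shards = x.chunk(tp_size, dim=-1)   # each simulated rank sees one X_i
a_shards = a.chunk(tp_size, dim=0)    # each simulated rank holds one A_i
partials = [x_i @ a_i for x_i, a_i in zip(x_shards, a_shards)]
out = sum(partials)                   # stands in for the all-reduce

assert torch.allclose(out, x @ a, atol=1e-5)
```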
- output_parallel = self.linear_method.apply_weights( - self.linear_weights, input_parallel) - if self.reduce_results and self.tp_size > 1: - output_ = tensor_model_parallel_all_reduce(output_parallel) - else: - output_ = output_parallel - - if not self.skip_bias_add: - output = output_ + self.bias if self.bias is not None else output_ - output_bias = None - else: - output = output_ - output_bias = self.bias - return output, output_bias diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py deleted file mode 100644 index b3449eaff0e35d6109b85cf2d5437a6f51877f93..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/quantization/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Type - -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig -from vllm.model_executor.layers.quantization.awq import AWQConfig -from vllm.model_executor.layers.quantization.gptq import GPTQConfig -from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig - -_QUANTIZATION_CONFIG_REGISTRY = { - "awq": AWQConfig, - "gptq": GPTQConfig, - "squeezellm": SqueezeLLMConfig, -} - - -def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: - if quantization not in _QUANTIZATION_CONFIG_REGISTRY: - raise ValueError(f"Invalid quantization method: {quantization}") - return _QUANTIZATION_CONFIG_REGISTRY[quantization] - - -__all__ = [ - "QuantizationConfig", - "get_quantization_config", -] diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py deleted file mode 100644 index 4d3fd3ec0cc715cfca1e4f69fffbb6861dae6f98..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/quantization/awq.py +++ /dev/null @@ -1,168 +0,0 @@ -from typing import Any, Dict, List, Optional - -import torch -from torch.nn.parameter import Parameter - -from vllm._C import ops -from vllm.model_executor.layers.linear import (LinearMethodBase, - set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig - - -class AWQConfig(QuantizationConfig): - """Config class for AWQ. - - Reference: https://arxiv.org/abs/2306.00978 - """ - - def __init__( - self, - weight_bits: int, - group_size: int, - zero_point: bool, - ) -> None: - self.weight_bits = weight_bits - self.group_size = group_size - self.zero_point = zero_point - - if self.weight_bits != 4: - raise ValueError( - "Currently, only 4-bit weight quantization is supported for " - f"AWQ, but got {self.weight_bits} bits.") - self.pack_factor = 32 // self.weight_bits - - def __repr__(self) -> str: - return (f"AWQConfig(weight_bits={self.weight_bits}, " - f"group_size={self.group_size}, " - f"zero_point={self.zero_point})") - - def get_name(self) -> str: - return "awq" - - def get_supported_act_dtypes(self) -> List[torch.dtype]: - return [torch.half] - - def get_min_capability(self) -> int: - # The AWQ kernel only supports Turing or newer GPUs. 
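One plausible way the registry in `quantization/__init__.py` above and this capability threshold fit together is sketched here: look up the config class by name, build it from a checkpoint's quantization dict, and compare `get_min_capability()` against the device's compute capability. The config dictionary and the surrounding glue are assumptions made for the example, not code taken from this diff.

```python
import torch

from vllm.model_executor.layers.quantization import get_quantization_config

# Assumed AWQ-style quantization config for illustration.
quant_cfg_dict = {"w_bit": 4, "q_group_size": 128, "zero_point": True}
quant_config = get_quantization_config("awq").from_config(quant_cfg_dict)

major, minor = torch.cuda.get_device_capability()
capability = major * 10 + minor  # e.g. 75 for Turing, 80 for Ampere
if capability < quant_config.get_min_capability():
    raise ValueError(
        f"{quant_config.get_name()} needs compute capability "
        f">= {quant_config.get_min_capability()}, got {capability}.")

linear_method = quant_config.get_linear_method()  # AWQLinearMethod
```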
- return 75 - - @staticmethod - def get_config_filenames() -> List[str]: - return [ - "quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq - "quantize_config.json", # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq - ] - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> "AWQConfig": - weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) - group_size = cls.get_from_keys(config, ["q_group_size", "group_size"]) - zero_point = cls.get_from_keys(config, ["zero_point"]) - return cls(weight_bits, group_size, zero_point) - - def get_linear_method(self) -> "AWQLinearMethod": - return AWQLinearMethod(self) - - def get_scaled_act_names(self) -> List[str]: - return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"] - - -class AWQLinearMethod(LinearMethodBase): - """Linear method for AWQ. - - Args: - quant_config: The AWQ quantization config. - """ - - def __init__(self, quant_config: AWQConfig): - self.quant_config = quant_config - - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: - if input_size_per_partition % self.quant_config.group_size != 0: - raise ValueError( - "The input size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - if output_size_per_partition % self.quant_config.pack_factor != 0: - raise ValueError( - "The output size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - - qweight = Parameter( - torch.empty( - input_size_per_partition, - output_size_per_partition // self.quant_config.pack_factor, - device="cuda", - dtype=torch.int32, - ), - requires_grad=False, - ) - set_weight_attrs( - qweight, { - "input_dim": 0, - "output_dim": 1, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }) - qzeros = Parameter( - torch.empty( - input_size_per_partition // self.quant_config.group_size, - output_size_per_partition // self.quant_config.pack_factor, - device="cuda", - dtype=torch.int32, - ), - requires_grad=False, - ) - set_weight_attrs( - qzeros, { - "input_dim": 0, - "output_dim": 1, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }) - scales = Parameter( - torch.empty( - input_size_per_partition // self.quant_config.group_size, - output_size_per_partition, - device="cuda", - dtype=params_dtype, - ), - requires_grad=False, - ) - set_weight_attrs(scales, { - "input_dim": 0, - "output_dim": 1, - }) - return { - "qweight": qweight, - "qzeros": qzeros, - "scales": scales, - } - - def apply_weights(self, - weights: Dict[str, Any], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - qweight = weights["qweight"] - qzeros = weights["qzeros"] - scales = weights["scales"] - pack_factor = self.quant_config.pack_factor - out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, )) - reshaped_x = x.reshape(-1, x.shape[-1]) - - # num_tokens >= threshold - FP16_MATMUL_HEURISTIC_CONDITION = x.shape[:-1].numel() >= 256 - - if FP16_MATMUL_HEURISTIC_CONDITION: - out = ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0) - out = torch.matmul(reshaped_x, out) - else: - out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros, - pack_factor) - if bias is not None: - out = out + bias - return out.reshape(out_shape) diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py deleted file 
mode 100644 index 6115e7c3be9565bd27b399d67548e75396d324cf..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/quantization/base_config.py +++ /dev/null @@ -1,64 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Any, Dict, List - -import torch - -from vllm.model_executor.layers.linear import LinearMethodBase - - -class QuantizationConfig(ABC): - """Base class for quantization configs.""" - - @abstractmethod - def get_name(self) -> str: - """Name of the quantization method.""" - raise NotImplementedError - - @abstractmethod - def get_supported_act_dtypes(self) -> List[torch.dtype]: - """List of supported activation dtypes.""" - raise NotImplementedError - - @abstractmethod - def get_min_capability(self) -> int: - """Minimum GPU capability to support the quantization method. - - E.g., 70 for Volta, 75 for Turing, 80 for Ampere. - This requirement is due to the custom CUDA kernels used by the - quantization method. - """ - raise NotImplementedError - - @staticmethod - @abstractmethod - def get_config_filenames() -> List[str]: - """List of filenames to search for in the model directory.""" - raise NotImplementedError - - @classmethod - @abstractmethod - def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig": - """Create a config class from the model's quantization config.""" - raise NotImplementedError - - @staticmethod - def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any: - """Get a value from the model's quantization config.""" - for key in keys: - if key in config: - return config[key] - raise ValueError(f"Cannot find any of {keys} in the model's " - "quantization config.") - - @abstractmethod - def get_linear_method(self) -> LinearMethodBase: - """Get the linear method to use for the quantized linear layer.""" - raise NotImplementedError - - @abstractmethod - def get_scaled_act_names(self) -> List[str]: - """Returns the activation function names that should be post-scaled. - - For now, this is only used by AWQ. - """ - raise NotImplementedError diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py deleted file mode 100644 index 8fe96e7ddb98dc61955b1d5af413b1de79aa3b0f..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/quantization/gptq.py +++ /dev/null @@ -1,215 +0,0 @@ -import enum -from enum import Enum -from typing import Any, Dict, List, Optional - -import torch -from torch.nn.parameter import Parameter - -from vllm._C import ops -from vllm.model_executor.layers.linear import (LinearMethodBase, - set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) - - -class GPTQConfig(QuantizationConfig): - """Config class for GPTQ. 
- - Reference: https://arxiv.org/abs/2210.17323 - """ - - def __init__( - self, - weight_bits: int, - group_size: int, - desc_act: bool, - ) -> None: - self.weight_bits = weight_bits - self.group_size = group_size - self.desc_act = desc_act - self.pack_factor = 32 // self.weight_bits - # exllama kernel v1 only supports 4 bit - if self.weight_bits != 4: - raise ValueError( - "Currently, only 4-bit weight quantization is supported for " - f"GPTQ, but got {self.weight_bits} bits.") - - def __repr__(self) -> str: - return (f"GPTQConfig(weight_bits={self.weight_bits}, " - f"group_size={self.group_size}, " - f"desc_act={self.desc_act})") - - @classmethod - def get_name(cls) -> str: - return "gptq" - - @classmethod - def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.half] - - @classmethod - # Need to figure it out - def get_min_capability(cls) -> int: - return 60 - - @classmethod - def get_config_filenames(cls) -> List[str]: - return ["quantize_config.json"] - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig": - weight_bits = cls.get_from_keys(config, ["bits"]) - group_size = cls.get_from_keys(config, ["group_size"]) - desc_act = cls.get_from_keys(config, ["desc_act"]) - return cls(weight_bits, group_size, desc_act) - - def get_linear_method(self) -> "GPTQLinearMethod": - return GPTQLinearMethod(self) - - def get_scaled_act_names(self) -> List[str]: - return [] - - -class ExllamaState(Enum): - - UNUSED = enum.auto() - UNINITIALIZED = enum.auto() - READY = enum.auto() - - -class GPTQLinearMethod(LinearMethodBase): - """Linear method for GPTQ. - - Args: - quant_config: The GPTQ quantization config. - """ - - def __init__(self, quant_config: GPTQConfig): - self.quant_config = quant_config - - def create_weights( - self, - input_size_per_partition: int, - output_size_per_partition: int, - input_size: int, - output_size: int, - params_dtype: torch.dtype, - ) -> Dict[str, Any]: - del output_size # Unused. - if input_size_per_partition % self.quant_config.group_size != 0: - raise ValueError( - "The input size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - if output_size_per_partition % self.quant_config.pack_factor != 0: - raise ValueError( - "The output size is not aligned with the quantized " - "weight shape. 
This can be caused by too large " - "tensor parallel size.") - - if self.quant_config.group_size != -1: - group_size = self.quant_config.group_size - else: - group_size = input_size - exllama_state = ExllamaState.UNINITIALIZED - scale_and_zero_size = input_size // group_size - scale_and_zero_input_dim = None - if input_size != input_size_per_partition and self.quant_config.group_size != -1: - # For act-order models, we cannot use Exllama for row parallel layer - if self.quant_config.desc_act: - exllama_state = ExllamaState.UNUSED - else: - # we need to partition qzeros and scales for exllama kernel - scale_and_zero_size = input_size_per_partition // group_size - scale_and_zero_input_dim = 0 - - qweight = Parameter( - torch.empty( - input_size_per_partition // self.quant_config.pack_factor, - output_size_per_partition, - device="cuda", - dtype=torch.int32, - ), - requires_grad=False, - ) - set_weight_attrs( - qweight, { - "input_dim": 0, - "output_dim": 1, - "packed_dim": 0, - "pack_factor": self.quant_config.pack_factor, - }) - g_idx = Parameter( - torch.tensor( - [ - i // self.quant_config.group_size - for i in range(input_size_per_partition) - ], - device="cuda", - dtype=torch.int32, - ), - requires_grad=False, - ) - # Ignore warning from fused linear layers such as QKVParallelLinear. - set_weight_attrs(g_idx, {"input_dim": 0, "ignore_warning": True}) - qzeros = Parameter( - torch.empty( - scale_and_zero_size, - output_size_per_partition // self.quant_config.pack_factor, - device="cuda", - dtype=torch.int32, - ), - requires_grad=False, - ) - set_weight_attrs( - qzeros, { - "input_dim": scale_and_zero_input_dim, - "output_dim": 1, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }) - scales = Parameter( - torch.empty( - scale_and_zero_size, - output_size_per_partition, - device="cuda", - dtype=params_dtype, - ), - requires_grad=False, - ) - set_weight_attrs(scales, { - "input_dim": scale_and_zero_input_dim, - "output_dim": 1, - }) - return { - "qweight": qweight, - "g_idx": g_idx, - "qzeros": qzeros, - "scales": scales, - "exllama_state": exllama_state, - } - - def apply_weights(self, - weights: Dict[str, Any], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - qweight = weights["qweight"] - out_shape = x.shape[:-1] + (qweight.shape[-1], ) - reshaped_x = x.reshape(-1, x.shape[-1]) - # exllama needs to shuffle the weight after the weight is loaded - # here we do the shuffle on first forward pass - if weights["exllama_state"] == ExllamaState.UNINITIALIZED: - if self.quant_config.desc_act: - weights["g_idx"] = torch.argsort(weights["g_idx"]).to( - torch.int) - else: - weights["g_idx"] = torch.empty((1, 1), device="meta") - weights["exllama_state"] = ExllamaState.READY - ops.gptq_shuffle(weights["qweight"], weights["g_idx"]) - output = ops.gptq_gemm(reshaped_x, weights["qweight"], - weights["qzeros"], weights["scales"], - weights["g_idx"], - weights["exllama_state"] == ExllamaState.READY) - if bias is not None: - output = output + bias - return output.reshape(out_shape) diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py deleted file mode 100644 index 1932bd145076b971860771c1ec24aecef9a0c2d1..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ /dev/null @@ -1,131 +0,0 @@ -from typing import Any, Dict, List, Optional - -import torch -from torch.nn.parameter import Parameter - -from vllm._C import ops -from 
vllm.model_executor.layers.linear import (LinearMethodBase, - set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig -from vllm.utils import is_hip - - -class SqueezeLLMConfig(QuantizationConfig): - """Config class for SqueezeLLM. - - Reference: https://arxiv.org/pdf/2306.07629 - """ - - def __init__( - self, - weight_bits: int, - ) -> None: - self.weight_bits = weight_bits - - if self.weight_bits != 4: - raise ValueError( - "Currently, only 4-bit weight quantization is supported for " - f"SqueezeLLM, but got {self.weight_bits} bits.") - - self.pack_factor = 32 // self.weight_bits - - def __repr__(self) -> str: - return f"SqueezeLLMConfig(weight_bits={self.weight_bits})" - - def get_name(self) -> str: - return "squeezellm" - - def get_supported_act_dtypes(self) -> List[torch.dtype]: - return [torch.half] - - def get_min_capability(self) -> int: - return 70 - - @staticmethod - def get_config_filenames() -> List[str]: - return ["quant_config.json"] - - @classmethod - def from_config(cls, config: Dict[str, Any]) -> "SqueezeLLMConfig": - weight_bits = cls.get_from_keys(config, ["wbits"]) - return cls(weight_bits) - - def get_linear_method(self) -> "SqueezeLLMLinearMethod": - return SqueezeLLMLinearMethod(self) - - def get_scaled_act_names(self) -> List[str]: - return [] - - -class SqueezeLLMLinearMethod(LinearMethodBase): - """Linear method for SqueezeLLM. - - Args: - quant_config: The SqueezeLLM quantization config. - """ - - def __init__(self, quant_config: SqueezeLLMConfig): - self.quant_config = quant_config - - def create_weights(self, input_size_per_partition: int, - output_size_per_partition: int, input_size: int, - output_size: int, - params_dtype: torch.dtype) -> Dict[str, Any]: - if input_size_per_partition % self.quant_config.pack_factor != 0: - raise ValueError( - "The input size is not aligned with the quantized " - "weight shape. This can be caused by too large " - "tensor parallel size.") - qweight = Parameter( - torch.empty( - input_size_per_partition // self.quant_config.pack_factor, - output_size_per_partition, - device="cuda", - dtype=torch.int32, - ), - requires_grad=False, - ) - set_weight_attrs( - qweight, { - "input_dim": 0, - "output_dim": 1, - "packed_dim": 0, - "pack_factor": self.quant_config.pack_factor, - }) - lookup_table = Parameter( - torch.empty( - output_size, - self.quant_config.weight_bits**2, - device="cuda", - dtype=params_dtype, - ), - requires_grad=False, - ) - set_weight_attrs(lookup_table, { - "output_dim": 0, - }) - return { - "qweight": qweight, - "lookup_table": lookup_table, - } - - def apply_weights(self, - weights: Dict[str, Any], - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - qweight = weights["qweight"] - lookup_table = weights["lookup_table"] - out_shape = x.shape[:-1] + (qweight.shape[-1], ) - reshaped_x = x.reshape(-1, x.shape[-1]) - if is_hip(): - out_f = torch.zeros(out_shape, device="cuda", dtype=torch.float) - ops.squeezellm_gemm(reshaped_x, qweight, out_f, lookup_table) - out = out_f.to(dtype=torch.float16) - else: - # NOTE: The output tensor should be zero-initialized. 
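Conceptually, `squeezellm_gemm` dequantizes each 4-bit code through its output channel's 16-entry lookup table and multiplies on the fly. The CPU sketch below mirrors that idea with made-up tensors; the nibble ordering inside each int32 is an assumption here (the real CUDA kernel owns the actual packing layout), and fp32 stands in for fp16 so the sketch runs on CPU.

```python
import torch

pack_factor = 8                       # eight 4-bit codes per int32
packed_rows, out_features = 4, 3      # toy sizes: 32 input features, 3 output features
qweight = torch.randint(0, 2**31 - 1, (packed_rows, out_features), dtype=torch.int32)
lookup_table = torch.randn(out_features, 16)  # 16 centroids per output channel

# Unpack the eight 4-bit codes stored in every int32 along the input dim
# (least-significant nibble first, by assumption).
shifts = torch.arange(0, 32, 4, dtype=torch.int32)                # [8]
codes = (qweight.unsqueeze(1) >> shifts.view(1, -1, 1)) & 0xF     # [packed_rows, 8, out_features]
codes = codes.reshape(packed_rows * pack_factor, out_features).long()

# Each code indexes its own channel's table: W[i, j] = lookup_table[j, codes[i, j]].
cols = torch.arange(out_features).expand_as(codes)
dequant = lookup_table[cols, codes]                               # [32, 3]

x = torch.randn(2, packed_rows * pack_factor)
out = x @ dequant   # the fused lookup + matmul that the GPU kernel performs
```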
- out = torch.zeros(out_shape, device="cuda", dtype=torch.float16) - ops.squeezellm_gemm(reshaped_x, qweight, out, lookup_table) - - if bias is not None: - out = out + bias - return out.reshape(out_shape) diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py deleted file mode 100644 index 3e1cfc783b8efd4228ba198cf7a2c8f75f6e3f2d..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/rejection_sampler.py +++ /dev/null @@ -1,392 +0,0 @@ -from typing import Tuple, Optional -from functools import cached_property - -import torch -import torch.nn as nn -import torch.jit - - -class RejectionSampler(nn.Module): - """Apply modified rejection sampling as described in "Accelerating Large - Language Model Decoding with Speculative Sampling" - https://arxiv.org/pdf/2302.01318.pdf. - """ - - def __init__(self, strict_mode: bool = False): - """Create a rejection sampler. - - Args: - strict_mode: Whether or not to perform shape/device/dtype checks - during sampling. This catches correctness issues but adds - nontrivial latency. - """ - super().__init__() - self.probs_dtype = torch.float32 - self.token_id_dtype = torch.int64 - self._strict_mode = strict_mode - - # NOTE: A "bonus token" is accepted iff all proposal tokens are - # accepted. There is always only one possible bonus token. We store this - # value in a variable for readability. - self._num_bonus_tokens = 1 - - self.num_accepted_tokens: Optional[torch.Tensor] = None - self.num_emitted_tokens: Optional[torch.Tensor] = None - self.num_draft_tokens: int = 0 - - def init_gpu_tensors(self, rank: int) -> None: - assert self.num_accepted_tokens is None - device = f"cuda:{rank}" - self.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - self.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - - def forward( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> torch.Tensor: - """Sample token ids using rejection sampling. This accepts or rejects - tokens proposed by the draft model using the probability of each token - according to the draft and target models. - - In the worst case where all draft tokens are rejected, it is guaranteed - one correct token will be emitted. - - In the case where all draft tokens are accepted, a bonus token will be - accepted as its cheap to have the target model score this speculative - sequence. - - Args: - target_probs: The probability distribution over token ids given - context according to the target model. - shape = [batch_size, num_speculative_tokens, vocab_size] - - bonus_token_ids: The "bonus" token ids that are accepted iff all - speculative tokens in a sequence are accepted. - shape = [batch_size, num_bonus_tokens] - - draft_probs: The probability distribution over token ids given - context according to the draft model. - shape = [batch_size, num_speculative_tokens, vocab_size] - - draft_token_ids: The token ids that were sampled from the draft - probabilities. - shape = [batch_size, num_speculative_tokens] - - Returns: - output_token_ids: The token ids sampled via rejection sampling, - or -1 if unable to sample a token because the previous token - was rejected. - shape = [batch_size, num_speculative_tokens + num_bonus_tokens] - """ - # Only perform shape/dtype/device checking in strict mode, as it adds - # overhead. 
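Before the batched implementation, the acceptance rule from the docstring above is easiest to see for a single position: accept draft token x with probability min(1, q(x)/p(x)) and, on rejection, resample from the normalized positive part of (q - p). This is a toy, non-batched sketch with made-up probabilities, not the code path itself.

```python
import torch

torch.manual_seed(0)
draft_probs = torch.tensor([0.50, 0.20, 0.20, 0.10])    # p, from the draft model
target_probs = torch.tensor([0.25, 0.40, 0.25, 0.10])   # q, from the target model
draft_token = torch.multinomial(draft_probs, 1).item()

accept_prob = torch.clamp(target_probs[draft_token] / draft_probs[draft_token], max=1.0)
if torch.rand(()) < accept_prob:
    token = draft_token
else:
    recovered = torch.clamp(target_probs - draft_probs, min=0)
    recovered = recovered / recovered.sum()              # (q - p)_+ normalized
    token = torch.multinomial(recovered, 1).item()
# Repeated many times, `token` follows target_probs, which is the point of the scheme.
```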
- if self._strict_mode: - self._raise_if_incorrect_shape(target_probs, bonus_token_ids, - draft_probs, draft_token_ids) - self._raise_if_incorrect_dtype(target_probs, bonus_token_ids, - draft_probs, draft_token_ids) - self._raise_if_inconsistent_device(target_probs, bonus_token_ids, - draft_probs, draft_token_ids) - self._raise_if_out_of_bounds_vocab(target_probs.shape[-1], - bonus_token_ids, - draft_token_ids) - - accepted, recovered_token_ids = self._batch_modified_rejection_sampling( - target_probs, - draft_probs, - draft_token_ids, - ) - - output_token_ids = self._create_output( - accepted, - recovered_token_ids, - draft_token_ids, - bonus_token_ids, - ) - return output_token_ids - - def _batch_modified_rejection_sampling( - self, - target_probs: torch.Tensor, # [batch_size, k, vocab_size] - draft_probs: torch.Tensor, # [batch_size, k, vocab_size] - draft_token_ids: torch.Tensor, # [batch_size, k] - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Perform modified rejection sampling on each sequence. - - Returns: - A tuple of two tensors: - 0: A bool tensor of which tokens in each sequence is accepted. - shape = [batch_size, k] - 1: Token ids sampled from a recovered distribution, to be used - when a token is rejected. - shape = [batch_size, k] - """ - - batch_size, k, vocab_size = draft_probs.shape - - # shape [batch_size, k] - accepted = self._get_accepted(target_probs, draft_probs, - draft_token_ids) - - recovered_probs = self._get_recovered_probs( - target_probs, draft_probs).reshape(batch_size * k, vocab_size) - - recovered_token_ids = _multinomial(recovered_probs, - num_samples=1).reshape( - batch_size, k) - return accepted, recovered_token_ids - - def _get_accepted( - self, - target_probs: torch.Tensor, # [batch_size, k, vocab_size] - draft_probs: torch.Tensor, # [batch_size, k, vocab_size] - draft_token_ids: torch.Tensor, # [batch_size, k] - ) -> torch.Tensor: - r"""Create bool matrix over the proposed draft tokens. If - True, then a token can be accepted, else it should be - rejected. - - Given :math:`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of - :math:`\hat{x}_{n+1}` given context :math:`x_1, \dots, x_n` according - to the target model, and :math:`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the - same conditional probability according to the draft model, the token - is accepted with probability: - - .. math:: - \min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)} - {p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right) - - This implementation does not apply causality. When using the output, - if a token is rejected, subsequent tokens should not be used. - - Returns a bool tensor of shape [batch_size, k] specifying which tokens - are accepted. 
- """ - batch_size, k, _ = draft_probs.shape - batch_indices = torch.arange(batch_size, - device=target_probs.device)[:, None] - probs_indicies = torch.arange(k, device=target_probs.device) - - # shape [batch_size, k] - selected_draft_probs = draft_probs[batch_indices, probs_indicies, - draft_token_ids] - - # shape [batch_size, k] - selected_target_probs = target_probs[batch_indices, probs_indicies, - draft_token_ids] - - uniform_rand = torch.rand(batch_size, - k, - dtype=self.probs_dtype, - device=target_probs.device) - capped_ratio = torch.minimum( - selected_target_probs / selected_draft_probs, - torch.full((1, ), 1, device=target_probs.device)) - accepted = uniform_rand < capped_ratio - - return accepted - - def _get_recovered_probs( - self, - target_probs: torch.Tensor, # [k, vocab_size] - draft_probs: torch.Tensor, # [k, vocab_size] - ) -> torch.Tensor: - r"""Create a probability distribution for each proposed token which can - be sampled if the proposed token is rejected. - - When this routine is applied sequentially, the true distribution of the - target model is recovered (within hardware numerics). - - The probability distribution used in this rejection case is constructed - as follows. Given :math:`q(x|x_1, \dots, x_n)`, the probability of - :math:`x` given context :math:`x_1, \dots, x_n` according to the target - model and :math:`p(x|x_1, \dots, x_n)`, the same conditional probability - according to the draft model: - - .. math:: - x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+ - - where :math:`(f(x))_+` is defined as: - - .. math:: - (f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))} - - See https://github.com/vllm-project/vllm/pull/2336 for a visualization - of the draft, target, and recovered probability distributions. - - Returns a tensor of shape [batch_size, k, vocab_size]. - - Note: This batches operations on GPU and thus constructs the recovered - distribution for all tokens, even if they are accepted. This causes - division-by-zero errors, so we use self._smallest_positive_value to - avoid that. This introduces some drift to the distribution. - """ - _, k, _ = draft_probs.shape - - # shape [batch_size, k, vocab_size] - difference = target_probs - draft_probs - - # TODO(cade): Can we use logprobs instead of probs, and avoid the - # division-by-zero errors without introducing distribution drift? - - # shape [batch_size, k, vocab_size] - f = torch.clamp(difference, min=self._smallest_positive_value) - - # shape [batch_size, k, vocab_size] - recovered_probs = f / torch.sum(f, dim=-1).reshape(-1, k, 1) - - return recovered_probs - - @cached_property - def _smallest_positive_value(self) -> float: - """Return the smallest positive value representable by the probs dtype. - This value is used when constructing a distribution from which to sample - recovered tokens in the first rejection case. - - See _get_recovered_probs for more details - - Note that this isn't actually the smallest positive value representable - by float32, but the smallest positive normal value. - See https://en.wikipedia.org/wiki/Subnormal_number for more information. - """ - return torch.finfo(self.probs_dtype).tiny - - def _create_output( - self, - accepted: torch.Tensor, # [batch_size, k] - recovered_token_ids: torch.Tensor, # [batch_size, k] - draft_token_ids: torch.Tensor, # [batch_size, k] - bonus_token_ids: torch.Tensor, # [batch_size] - ) -> torch.Tensor: - """Format output. Returns a matrix of token ids. 
When - a token is rejected via rejection sampling, all subsequent - token ids are set to -1 for the sequence. - - shape = [batch_size, k + num_bonus_tokens] - """ - bonus_token_ids = bonus_token_ids.squeeze() - batch_size, k = recovered_token_ids.shape - - # Determine the index of the first False value for each row. - limits = (accepted == 0).max(1).indices - limits[~(accepted == 0).any(1)] = k - - # Create masks using the indices. - indices = torch.arange(k, device=accepted.device).unsqueeze(0) - accepted_mask = indices < limits.unsqueeze(1) - after_false_mask = indices == limits.unsqueeze(1) - - # Create an extended output tensor - output_with_bonus_tokens = -torch.ones( - (batch_size, k + self._num_bonus_tokens), - dtype=self.token_id_dtype, - device=accepted.device) - output = output_with_bonus_tokens[:, :k] - - # Fill in the first k columns of the output tensor using masks and data - # tensors. - output[:, :k] = torch.where(accepted_mask, draft_token_ids, - -torch.ones_like(draft_token_ids)) - - # Fill the last column. - # We check output directly as accepted may have True values inconsistent - # with causal acceptance. - output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, - bonus_token_ids, -1) - - # Fill the recovered token ids. - output.mul_(~after_false_mask).add_( - recovered_token_ids.mul(after_false_mask)) - - self.num_accepted_tokens += accepted.sum() - self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() - self.num_draft_tokens += batch_size * k - - return output_with_bonus_tokens - - def _raise_if_incorrect_shape( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - (target_batch_size, num_target_probs, - target_vocab_size) = target_probs.shape - bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape - draft_batch_size, num_draft_probs, draft_vocab_size = draft_probs.shape - draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape - - assert draft_batch_size == target_batch_size - assert num_draft_probs == num_target_probs - assert (draft_vocab_size == target_vocab_size - ), f"{draft_vocab_size=} {target_vocab_size=}" - - assert draft_token_ids_batch_size == draft_batch_size - assert num_draft_token_ids == num_draft_probs - - assert bonus_batch_size == target_batch_size - assert num_bonus_tokens == self._num_bonus_tokens - - def _raise_if_incorrect_dtype( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - assert all(probs.dtype == self.probs_dtype - for probs in [target_probs, draft_probs]) - assert all(token_ids.dtype == self.token_id_dtype - for token_ids in [bonus_token_ids, draft_token_ids]) - - def _raise_if_inconsistent_device( - self, - target_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - devices = [ - t.device for t in - [target_probs, bonus_token_ids, draft_probs, draft_token_ids] - ] - assert all([devices[0] == device for device in devices]) - - def _raise_if_out_of_bounds_vocab( - self, - vocab_size: int, - bonus_token_ids: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> None: - assert torch.all(bonus_token_ids < vocab_size) - assert torch.all(bonus_token_ids >= 0) - assert torch.all(draft_token_ids < vocab_size) - assert torch.all(draft_token_ids >= 0) - - -# torch.multinomial forces a GPU<->CPU sync. 
-# Therefore, we use an optimized implementation instead that skips the sync. -# Note that we always sample with replacement. -# probs will be modified in place, but this is fine, as we pass -# in a copy already. -@torch.jit.script -def _multinomial( - probs: torch.Tensor, - num_samples: int, -) -> torch.Tensor: - if num_samples > 1: - # This is equivalent to torch.repeat_interleaved (which also - # forces a GPU<->CPU sync). - probs = probs[:, None, :].expand(probs.shape[0], num_samples, - probs.shape[1]).contiguous().view( - -1, probs.shape[1]) - q = torch.empty_like(probs).exponential_(1.0) - return probs.div_(q).argmax(dim=1).view(-1, num_samples) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py deleted file mode 100644 index 91c093e33e3c94285e4988e81aa615bb480edd27..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/rotary_embedding.py +++ /dev/null @@ -1,378 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Rotary Positional Embeddings.""" -import math -from typing import Any, Dict, Optional, Tuple, Union - -import torch -import torch.nn as nn - -from vllm._C import ops - - -def _rotate_neox(x: torch.Tensor) -> torch.Tensor: - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def _rotate_gptj(x: torch.Tensor) -> torch.Tensor: - x1 = x[..., ::2] - x2 = x[..., 1::2] - x = torch.stack((-x2, x1), dim=-1) - return x.flatten(-2) - - -class RotaryEmbedding(nn.Module): - """Original rotary positional embedding.""" - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: int, - is_neox_style: bool, - ) -> None: - super().__init__() - self.head_size = head_size - self.rotary_dim = rotary_dim - self.max_position_embeddings = max_position_embeddings - self.base = base - self.is_neox_style = is_neox_style - - cache = self._compute_cos_sin_cache() - cache = cache.to(torch.get_default_dtype()) - self.register_buffer("cos_sin_cache", cache, persistent=False) - - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: - """Compute the inverse frequency.""" - # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`. - # However, we use `torch.arange(..., dtype=torch.float)` instead to - # avoid numerical issues with large base values (e.g., 10000000). - # This may cause a slight numerical difference between the HF - # implementation and ours. 
- # NOTE(woosuk): To exactly match the HF implementation, we need to - # use CPU to compute the cache and then move it to GPU. However, we - # create the cache on GPU for faster initialization. This may cause - # a slight numerical difference between the HF implementation and ours. - inv_freq = 1.0 / (base**(torch.arange( - 0, self.rotary_dim, 2, dtype=torch.float, device="cuda") / - self.rotary_dim)) - return inv_freq - - def _compute_cos_sin_cache(self) -> torch.Tensor: - """Compute the cos and sin cache.""" - inv_freq = self._compute_inv_freq(self.base) - t = torch.arange(self.max_position_embeddings, - dtype=torch.float, - device="cuda") - - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - return cache - - def _forward( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """PyTorch-native implementation equivalent to forward().""" - query = query.view(*query.shape[:-1], -1, self.head_size) - key = key.view(*key.shape[:-1], -1, self.head_size) - - query_rot = query[..., :self.rotary_dim] - key_rot = key[..., :self.rotary_dim] - if self.rotary_dim < self.head_size: - query_pass = query[..., self.rotary_dim:] - key_pass = key[..., self.rotary_dim:] - - cos_sin = self.cos_sin_cache[positions] - cos, sin = cos_sin.chunk(2, dim=-1) - if self.is_neox_style: - # NOTE(woosuk): Here we assume that the positions tensor has the - # shape [batch_size, seq_len]. - cos = cos.repeat(1, 1, 2).unsqueeze(-2) - sin = sin.repeat(1, 1, 2).unsqueeze(-2) - else: - cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) - sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) - - rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj - query_rot = query_rot * cos + rotate_fn(query_rot) * sin - key_rot = key_rot * cos + rotate_fn(key_rot) * sin - - if self.rotary_dim < self.head_size: - query = torch.cat((query_rot, query_pass), dim=-1) - key = torch.cat((key_rot, key_pass), dim=-1) - else: - query = query_rot - key = key_rot - query = query.flatten(-2) - key = key.flatten(-2) - return query, key - - def forward( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # ops.rotary_embedding() is an in-place operation that - # updates the query and key tensors. - ops.rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, self.is_neox_style) - return query, key - - -class LinearScalingRotaryEmbedding(RotaryEmbedding): - """RotaryEmbedding extended with linear scaling. - - Credits to the Reddit user /u/kaiokendev - """ - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: int, - is_neox_style: bool, - scaling_factor: float, - ) -> None: - self.scaling_factor = scaling_factor - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style) - - def _compute_cos_sin_cache(self) -> torch.Tensor: - inv_freq = self._compute_inv_freq(self.base) - # NOTE(woosuk): self.max_position_embeddings is the original - # maximum length before applying the rope scaling. - # Thus, the maximum length after applying the rope scaling is - # self.max_position_embeddings * self.scaling_factor. 
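The division by `scaling_factor` just below is the whole trick of linear ("position interpolation") scaling: positions are squeezed back into the range the model saw during training, so the cached cos/sin table covers a longer context with fractional positions. A toy check, with 4096 tokens and a factor of 2 as example values:

```python
import torch

max_position_embeddings, scaling_factor = 4096, 2.0   # example values only
max_len = int(max_position_embeddings * scaling_factor)
t = torch.arange(max_len, dtype=torch.float) / scaling_factor

assert t[1] == 0.5        # interpolated, previously unseen position
assert t[8190] == 4095.0  # scaled position 8190 reuses the phase of original position 4095
```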
- max_len = self.max_position_embeddings * self.scaling_factor - t = torch.arange(max_len, dtype=torch.float, device="cuda") - t = t / self.scaling_factor - - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - return cache - - -class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): - """RotaryEmbedding extended with Dynamic NTK scaling. - - Credits to the Reddit users /u/bloc97 and /u/emozilla - """ - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: int, - is_neox_style: bool, - scaling_factor: float, - ) -> None: - self.scaling_factor = scaling_factor - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style) - - def _compute_cos_sin_cache(self) -> torch.Tensor: - # NOTE(woosuk): self.max_position_embeddings is the original - # maximum length before applying the rope scaling. - # Thus, the maximum length after applying the rope scaling is - # self.max_position_embeddings * self.scaling_factor. - max_len = self.max_position_embeddings * self.scaling_factor - base = self.base * ( - (self.scaling_factor * max_len / self.max_position_embeddings) - - (self.scaling_factor - 1))**(self.rotary_dim / - (self.rotary_dim - 2)) - inv_freq = self._compute_inv_freq(base) - t = torch.arange(max_len, dtype=torch.float, device="cuda") - - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - return cache - - -# Inverse dim formula to find dim based on number of rotations -def _yarn_find_correction_dim(num_rotations: int, - dim: int, - base: float = 10000, - max_position_embeddings: int = 2048) -> float: - return (dim * math.log(max_position_embeddings / - (num_rotations * 2 * math.pi))) / (2 * - math.log(base)) - - -# Find dim range bounds based on rotations -def _yarn_find_correction_range(low_rot: int, - high_rot: int, - dim: int, - base: float = 10000, - max_position_embeddings: int = 2048) -> int: - low = math.floor( - _yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)) - high = math.ceil( - _yarn_find_correction_dim(high_rot, dim, base, - max_position_embeddings)) - return max(low, 0), min(high, dim - 1) # Clamp values just in case - - -def _yarn_linear_ramp_mask(low: float, high: float, dim: int, - dtype: torch.dtype, - device: torch.device) -> torch.Tensor: - if low == high: - high += 0.001 # Prevent singularity - - linear_func = (torch.arange(dim, dtype=dtype, device=device) - - low) / (high - low) - ramp_func = torch.clamp(linear_func, 0, 1) - return ramp_func - - -def _yarn_get_mscale(scale: float = 1) -> float: - if scale <= 1: - return 1.0 - return 0.1 * math.log(scale) + 1.0 - - -class YaRNScalingRotaryEmbedding(RotaryEmbedding): - """RotaryEmbedding extended with YaRN method. - - Credits to Peng et al. 
github.com/jquesnelle/yarn - """ - - def __init__( - self, - head_size: int, - rotary_dim: int, - max_position_embeddings: int, - base: int, - is_neox_style: bool, - scaling_factor: float, - *, - extrapolation_factor: float = 1, - attn_factor: float = 1, - beta_fast: float = 32, - beta_slow: float = 1, - ) -> None: - self.scaling_factor = scaling_factor - self.extrapolation_factor = extrapolation_factor - self.attn_factor = attn_factor - self.beta_fast = beta_fast - self.beta_slow = beta_slow - # Get n-d magnitude scaling corrected for interpolation - self.mscale = float( - _yarn_get_mscale(self.scaling_factor) * attn_factor) - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style) - - def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: - pos_freqs = self.base**(torch.arange( - 0, self.rotary_dim, 2, dtype=torch.float, device="cuda") / - self.rotary_dim) - inv_freq_extrapolation = 1.0 / pos_freqs - inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) - - low, high = _yarn_find_correction_range(self.beta_fast, self.beta_slow, - self.rotary_dim, self.base, - self.max_position_embeddings) - # Get n-d rotational scaling corrected for extrapolation - inv_freq_mask = (1 - _yarn_linear_ramp_mask( - low, high, self.rotary_dim // 2, dtype=torch.float, - device="cuda")) * self.extrapolation_factor - inv_freq = inv_freq_interpolation * ( - 1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask - return inv_freq - - def _compute_cos_sin_cache(self) -> torch.Tensor: - inv_freq = self._compute_inv_freq(self.scaling_factor) - t = torch.arange(self.max_position_embeddings * self.scaling_factor, - device="cuda", - dtype=torch.float32) - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = (freqs.cos() * self.mscale) - sin = (freqs.sin() * self.mscale) - cache = torch.cat((cos, sin), dim=-1) - return cache - - -_ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {} - - -def get_rope( - head_size: int, - rotary_dim: int, - max_position: int, - base: int, - is_neox_style: bool = True, - rope_scaling: Optional[Dict[str, Any]] = None, -) -> RotaryEmbedding: - key = (head_size, rotary_dim, max_position, base, is_neox_style, - tuple(rope_scaling.items()) if rope_scaling is not None else None) - if key in _ROPE_DICT: - return _ROPE_DICT[key] - - if rope_scaling is None: - rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style) - else: - scaling_type = rope_scaling["type"] - scaling_factor = rope_scaling["factor"] - if scaling_type == "linear": - rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim, - max_position, base, - is_neox_style, - scaling_factor) - elif scaling_type == "dynamic": - rotary_emb = DynamicNTKScalingRotaryEmbedding( - head_size, rotary_dim, max_position, base, is_neox_style, - scaling_factor) - elif scaling_type == "yarn": - original_max_position = rope_scaling[ - "original_max_position_embeddings"] - assert max_position == original_max_position * scaling_factor - extra_kwargs = { - k: v - for k, v in rope_scaling.items() - if k in ("extrapolation_factor", "attn_factor", "beta_fast", - "beta_slow") - } - rotary_emb = YaRNScalingRotaryEmbedding(head_size, rotary_dim, - original_max_position, - base, is_neox_style, - scaling_factor, - **extra_kwargs) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - _ROPE_DICT[key] = rotary_emb - return rotary_emb diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py deleted file mode 100644 index 
bc86a916b5bbfd94f593838dd78749bb2f295fc9..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/sampler.py +++ /dev/null @@ -1,561 +0,0 @@ -"""A layer that samples the next tokens from the model's outputs.""" -from typing import Dict, List, Optional, Tuple - -import torch -import torch.nn as nn - -from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_gather) -from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors -from vllm.sampling_params import SamplingParams, SamplingType -from vllm.sequence import (PromptLogprobs, SampleLogprobs, SamplerOutput, - SequenceData, SequenceGroupOutput, SequenceOutput) - - -class Sampler(nn.Module): - """Samples the next tokens from the model's outputs. - - This layer does the following: - 1. Discard the hidden states that are not used for sampling (i.e., all - tokens except the final one in each prompt). - 2. Compute the logits for the next tokens. - 3. Apply presence, frequency and repetition penalties. - 4. Apply temperature scaling. - 5. Apply top-p and top-k truncation. - 6. Sample the next tokens. - Here, each sequence group within the batch can have different sampling - parameters (e.g., sampling method, temperature, top-p, top-k, etc.). - """ - - def __init__(self, - vocab_size: int, - org_vocab_size: Optional[int] = None) -> None: - super().__init__() - self.vocab_size = vocab_size - # original vocabulary size (without LoRA). - self.org_vocab_size = org_vocab_size or vocab_size - - def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor, - embedding_bias: Optional[torch.Tensor]) -> torch.Tensor: - # Get the logits for the next tokens. - logits = torch.matmul(hidden_states, embedding.t()) - if embedding_bias is not None: - logits += embedding_bias - logits = tensor_model_parallel_gather(logits) - # Remove paddings in vocab (if any). - if logits is not None: - logits = logits[:, :self.org_vocab_size] - return logits - - def forward( - self, - embedding: torch.Tensor, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - embedding_bias: Optional[torch.Tensor] = None, - ) -> Optional[SamplerOutput]: - # Get the hidden states that we use for sampling. - hidden_states = _prune_hidden_states(hidden_states, sampling_metadata) - - # Get the logits for the next tokens. - logits = self._get_logits(hidden_states, embedding, embedding_bias) - - # Only perform sampling in the driver worker. - # Note: `_get_logits` is still distributed across TP workers because - # the `embedding` weight is distributed across TP workers. - # TODO(zhuohan): Change the get_logits part to a separate stage. - if not sampling_metadata.perform_sampling: - return None - - assert logits is not None - _, vocab_size = logits.shape - - # Apply logits processors (if any). - logits = _apply_logits_processors(logits, sampling_metadata) - - # Prepare sampling tensors with pinned memory to avoid blocking. - (sampling_tensors, do_penalties, do_top_p_top_k, - do_min_p) = SamplingTensors.from_sampling_metadata( - sampling_metadata, vocab_size, logits.device, logits.dtype) - - # Apply presence and frequency penalties. - if do_penalties: - logits = _apply_penalties(logits, sampling_tensors.prompt_tokens, - sampling_tensors.output_tokens, - sampling_tensors.presence_penalties, - sampling_tensors.frequency_penalties, - sampling_tensors.repetition_penalties) - - # Apply temperature scaling. - # Use in-place division to avoid creating a new tensor. 
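For a single sequence, steps 3 and 4 of the pipeline described in the Sampler docstring (penalties, then temperature) look roughly like the sketch below. The numbers are toy values, only previously generated tokens are considered (the real `_apply_penalties` also folds prompt tokens into the repetition mask), and everything is done out of place for clarity.

```python
import torch

logits = torch.tensor([2.0, -1.0, 0.5, 0.0])               # toy 4-token vocabulary
output_token_counts = torch.tensor([3.0, 0.0, 1.0, 0.0])    # how often each token was generated
seen = output_token_counts > 0
repetition_penalty, frequency_penalty, presence_penalty, temperature = 1.2, 0.1, 0.2, 0.7

# Repetition penalty: shrink positive logits and push negative ones further
# down, but only for tokens that have already appeared.
penalized = torch.where(logits > 0, logits / repetition_penalty, logits * repetition_penalty)
logits = torch.where(seen, penalized, logits)

# OpenAI-style frequency and presence penalties.
logits = logits - frequency_penalty * output_token_counts
logits = logits - presence_penalty * seen.float()

# Temperature scaling, as in the in-place division right below.
logits = logits / temperature
probs = torch.softmax(logits, dim=-1)
```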
- logits.div_(sampling_tensors.temperatures.unsqueeze_(dim=1)) - - if do_top_p_top_k: - logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, - sampling_tensors.top_ks) - - if do_min_p: - logits = _apply_min_p(logits, sampling_tensors.min_ps) - - # We use float32 for probabilities and log probabilities. - # Compute the probabilities. - probs = torch.softmax(logits, dim=-1, dtype=torch.float) - # Compute the log probabilities. - # Use log_softmax to ensure numerical stability. - logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float) - - # Sample the next tokens. - sample_results = _sample(probs, logprobs, sampling_metadata) - # Get the logprobs query results. - prompt_logprobs, sample_logprobs = _get_logprobs( - logprobs, sampling_metadata, sample_results) - return _build_sampler_output(sample_results, sampling_metadata, - prompt_logprobs, sample_logprobs) - - -def _prune_hidden_states( - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, -) -> torch.Tensor: - hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - return hidden_states.index_select(0, - sampling_metadata.selected_token_indices) - - -def _get_bin_counts_and_mask( - tokens: torch.Tensor, - vocab_size: int, - num_seqs: int, -) -> Tuple[torch.Tensor, torch.Tensor]: - # Compute the bin counts for the tokens. - # vocab_size + 1 for padding. - bin_counts = torch.zeros((num_seqs, vocab_size + 1), - dtype=torch.long, - device=tokens.device) - bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens)) - bin_counts = bin_counts[:, :vocab_size] - mask = bin_counts > 0 - - return bin_counts, mask - - -def _apply_logits_processors( - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, -) -> torch.Tensor: - logits_row_idx = 0 - found_logits_processors = False - for seq_ids, sampling_params in sampling_metadata.seq_groups: - logits_processors = sampling_params.logits_processors - if logits_processors: - found_logits_processors = True - for seq_id in seq_ids: - logits_row = logits[logits_row_idx] - token_ids = sampling_metadata.seq_data[seq_id].output_token_ids - for logits_processor in logits_processors: - logits_row = logits_processor(token_ids, logits_row) - logits[logits_row_idx] = logits_row - logits_row_idx += 1 - else: - logits_row_idx += len(seq_ids) - if found_logits_processors: - assert logits_row_idx == logits.shape[0] - return logits - - -def _apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, - output_tokens_tensor: torch.Tensor, - presence_penalties: torch.Tensor, - frequency_penalties: torch.Tensor, - repetition_penalties: torch.Tensor) -> torch.Tensor: - num_seqs, vocab_size = logits.shape - _, prompt_mask = _get_bin_counts_and_mask(prompt_tokens_tensor, vocab_size, - num_seqs) - output_bin_counts, output_mask = _get_bin_counts_and_mask( - output_tokens_tensor, vocab_size, num_seqs) - - repetition_penalties = repetition_penalties[:, None].repeat(1, vocab_size) - repetition_penalties[~(prompt_mask | output_mask)] = 1.0 - logits = torch.where(logits > 0, logits / repetition_penalties, - logits * repetition_penalties) - - # We follow the definition in OpenAI API. 
- # Refer to https://platform.openai.com/docs/api-reference/parameter-details - logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts - logits -= presence_penalties.unsqueeze_(dim=1) * output_mask - return logits - - -def _apply_top_k_top_p( - logits: torch.Tensor, - p: torch.Tensor, - k: torch.Tensor, -) -> torch.Tensor: - logits_sort, logits_idx = logits.sort(dim=-1, descending=False) - - # Apply top-k. - top_k_mask = logits_sort.size(1) - k.to(torch.long) - # Get all the top_k values. - top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1)) - top_k_mask = logits_sort < top_k_mask - logits_sort.masked_fill_(top_k_mask, -float("inf")) - - # Apply top-p. - probs_sort = logits_sort.softmax(dim=-1) - probs_sum = probs_sort.cumsum(dim=-1) - top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) - # at least one - top_p_mask[:, -1] = False - logits_sort.masked_fill_(top_p_mask, -float("inf")) - - # Re-sort the probabilities. - src = torch.arange(logits_idx.shape[-1], - device=logits_idx.device).expand_as(logits_idx) - logits_idx_inv = torch.empty_like(logits_idx).scatter_(dim=-1, - index=logits_idx, - src=src) - logits = torch.gather(logits_sort, dim=-1, index=logits_idx_inv) - return logits - - -def _apply_min_p( - logits: torch.Tensor, - min_p: torch.Tensor, -) -> torch.Tensor: - """ - Adapted from - https://github.com/oobabooga/text-generation-webui/blob/3146124ec01f02c8fb1650a6517cf1b60b537aaf/modules/sampler_hijack.py#L16C17-L16C17 - """ - probs = torch.softmax(logits, dim=-1) - top_probs, _ = probs.max(dim=-1, keepdim=True) - scaled_min_p = min_p.unsqueeze_(dim=1) * top_probs - tokens_to_remove = probs < scaled_min_p - logits = logits.masked_fill_(tokens_to_remove, -float("inf")) - - return logits - - -def _greedy_sample( - selected_seq_groups: List[Tuple[List[int], SamplingParams]], - samples: torch.Tensor, -) -> List[Tuple[List[int], List[int]]]: - samples = samples.tolist() - sample_idx = 0 - results = [] - for seq_group in selected_seq_groups: - seq_ids, _ = seq_group - num_parent_seqs = len(seq_ids) - assert num_parent_seqs == 1, ( - "Greedy sampling should have only one seq.") - parent_ids = list(range(num_parent_seqs)) - next_token_ids = [samples[sample_idx]] - results.append((next_token_ids, parent_ids)) - sample_idx += num_parent_seqs - return results - - -def _random_sample( - selected_seq_groups: List[Tuple[List[int], SamplingParams]], - is_prompts: List[bool], - random_samples: torch.Tensor, -) -> List[Tuple[List[int], List[int]]]: - # Find the maximum best_of value of the prompt phase requests. - random_samples = random_samples.cpu() - sample_idx = 0 - results = [] - for seq_group, is_prompt in zip(selected_seq_groups, is_prompts): - seq_ids, sampling_params = seq_group - num_parent_seqs = len(seq_ids) - if is_prompt: - # Prompt phase. - parent_ids = [0] * sampling_params.best_of - next_token_ids = random_samples[ - sample_idx, :sampling_params.best_of].tolist() - else: - # Generation phase. 
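The top-k, top-p, and min-p filters above operate on sorted, batched tensors. As a hedged illustration only, the following sketch applies the same three ideas to a single logits row with hand-picked thresholds; it sorts descending and is deliberately simpler than the batched implementation (which sorts ascending and re-scatters the result).

```python
import torch

logits = torch.tensor([2.0, 1.0, 0.5, 0.1, -1.0, -3.0])
top_k, top_p, min_p = 4, 0.9, 0.05

# Top-k: keep only the k largest logits.
kth = torch.topk(logits, top_k).values[-1]
logits = logits.masked_fill(logits < kth, float("-inf"))

# Top-p: drop a token if the probability mass *before* it already covers top_p.
sorted_logits, sorted_idx = logits.sort(descending=True)
sorted_probs = torch.softmax(sorted_logits, dim=-1)
cumulative = sorted_probs.cumsum(dim=-1)
drop = (cumulative - sorted_probs) > top_p
logits[sorted_idx[drop]] = float("-inf")

# Min-p: drop tokens whose probability falls below min_p * max probability.
probs = torch.softmax(logits, dim=-1)
logits = logits.masked_fill(probs < min_p * probs.max(), float("-inf"))

print(torch.softmax(logits, dim=-1))
```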
- parent_ids = list(range(num_parent_seqs)) - next_token_ids = random_samples[sample_idx:sample_idx + - num_parent_seqs, 0].tolist() - results.append((next_token_ids, parent_ids)) - sample_idx += num_parent_seqs - return results - - -def _beam_search_sample( - selected_seq_groups: List[Tuple[List[int], SamplingParams]], - is_prompts: List[bool], - seq_data: Dict[int, SequenceData], - logprobs: torch.Tensor, -) -> List[Tuple[List[int], List[int]]]: - # We sample 2 * beam_width candidates to make sure that with high - # probability we can get `beam_width` candidates in addition to - # the finished sequences for the next iteration. See - # https://github.com/tensorflow/tensor2tensor/blob/bafdc1b67730430d38d6ab802cbd51f9d053ba2e/tensor2tensor/utils/beam_search.py#L557-L563 - # for details. See also HF reference: - # https://github.com/huggingface/transformers/blob/a4dd53d88e4852f023332d284ff07a01afcd5681/src/transformers/generation/utils.py#L3063-L3065 - # - # NOTE: Beam search is not vectorized, so its speed can be slower than - # other sampling methods. - sample_idx = 0 - results = [] - for seq_group, is_prompt in zip(selected_seq_groups, is_prompts): - seq_ids, sampling_params = seq_group - num_parent_seqs = len(seq_ids) - beam_width = sampling_params.best_of - seq_group_logprobs = logprobs[sample_idx:sample_idx + num_parent_seqs] - if is_prompt: - # Prompt phase. - assert num_parent_seqs == 1, ( - "Prompt input should have only one seq.") - parent_ids = [0] * (2 * beam_width) - _, next_token_ids = torch.topk(seq_group_logprobs[0], - 2 * beam_width) - next_token_ids = next_token_ids.tolist() - else: - # Generation phase. - cumulative_logprobs = [ - seq_data[seq_id].cumulative_logprob for seq_id in seq_ids - ] - cumulative_logprobs = torch.tensor( - cumulative_logprobs, - dtype=torch.float, - device=seq_group_logprobs.device) - seq_group_logprobs = (seq_group_logprobs + - cumulative_logprobs.unsqueeze(dim=1)) - _, topk_ids = torch.topk(seq_group_logprobs.flatten(), - 2 * beam_width) - topk_ids = topk_ids.tolist() - vocab_size = seq_group_logprobs.size(-1) - parent_ids = [i // vocab_size for i in topk_ids] - next_token_ids = [i % vocab_size for i in topk_ids] - results.append((next_token_ids, parent_ids)) - sample_idx += num_parent_seqs - assert sample_idx == logprobs.size(0) - return results - - -# torch.multinomial forces a GPU<->CPU sync. -# Therefore, we use an optimized implementation instead. -# Note that we always sample with replacement. -# probs will be modified in place, but this is fine, as we pass -# in a copy already. -def _multinomial( - probs: torch.Tensor, - num_samples: int, -): - if num_samples > 1: - # This is equivalent to torch.repeat_interleaved (which also - # forces a GPU<->CPU sync). - # This allows us to do sampling with replacement by creating - # num_samples copies of each row in the tensor, and then - # batch sampling the resulting tensor. 
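The beam-search branch above scores candidates by adding each beam's cumulative log-probability to the current step's log-probabilities, takes `2 * beam_width` candidates from the flattened scores, and recovers the parent beam and token id with integer division and modulo. A small standalone sketch of that selection step (scores and sizes are illustrative):

```python
import torch

torch.manual_seed(0)
beam_width, vocab_size = 2, 10
step_logprobs = torch.log_softmax(torch.randn(beam_width, vocab_size), dim=-1)
cumulative = torch.tensor([-1.2, -2.5])            # running scores per beam

scores = step_logprobs + cumulative[:, None]       # (beam_width, vocab_size)
topk_scores, topk_ids = torch.topk(scores.flatten(), 2 * beam_width)

parent_ids = [i // vocab_size for i in topk_ids.tolist()]
next_token_ids = [i % vocab_size for i in topk_ids.tolist()]
print(list(zip(parent_ids, next_token_ids, topk_scores.tolist())))
```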
- probs = probs[:, None, :].expand(probs.shape[0], num_samples, - probs.shape[1]).contiguous().view( - -1, probs.shape[1]) - q = torch.empty_like(probs).exponential_(1) - return probs.div_(q).argmax(dim=1).view(-1, num_samples) - - -def _sample( - probs: torch.Tensor, - logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, -) -> List[Tuple[List[int], List[int]]]: - categorized_seq_group_ids = {t: [] for t in SamplingType} - categorized_sample_indices = sampling_metadata.categorized_sample_indices - for i, seq_group in enumerate(sampling_metadata.seq_groups): - _, sampling_params = seq_group - sampling_type = sampling_params.sampling_type - categorized_seq_group_ids[sampling_type].append(i) - - sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} - sample_metadata = {} - - # Counterintiutively, having two loops here is actually faster. - # The first loop can run without waiting on GPU<->CPU sync. - for sampling_type in SamplingType: - sample_indices = categorized_sample_indices[sampling_type] - num_tokens = len(sample_indices) - if num_tokens == 0: - continue - seq_group_ids = categorized_seq_group_ids[sampling_type] - seq_groups = [sampling_metadata.seq_groups[i] for i in seq_group_ids] - is_prompts = [i < sampling_metadata.num_prompts for i in seq_group_ids] - sample_metadata[sampling_type] = (seq_group_ids, seq_groups, - is_prompts, sample_indices) - if sampling_type == SamplingType.GREEDY: - greedy_samples = torch.argmax(logprobs[sample_indices], dim=-1) - elif sampling_type == SamplingType.RANDOM: - max_best_of = 1 - for seq_group, is_prompt in zip(seq_groups, is_prompts): - if is_prompt: - _, sampling_params = seq_group - max_best_of = max(max_best_of, sampling_params.best_of) - multinomial_samples = _multinomial(probs[sample_indices], - max_best_of) - elif sampling_type == SamplingType.BEAM: - beam_search_logprobs = logprobs[sample_indices] - else: - raise ValueError(f"Unsupported sampling type: {sampling_type}") - - # GPU<->CPU sync happens in the loop below. 
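The `_multinomial` helper above samples by dividing the probabilities by Exponential(1) noise and taking an argmax, which avoids the host synchronization that `torch.multinomial` incurs. The trick works because for independent `q_i ~ Exp(1)`, the index maximizing `p_i / q_i` is distributed as `Categorical(p)`. A quick empirical sanity check of that equivalence (sample count and probabilities are illustrative):

```python
import torch

torch.manual_seed(0)
probs = torch.tensor([0.1, 0.2, 0.3, 0.4])
n = 200_000

q = torch.empty(n, probs.numel()).exponential_(1.0)
samples = (probs / q).argmax(dim=-1)
empirical = torch.bincount(samples, minlength=probs.numel()).float() / n
print(empirical)   # close to tensor([0.1, 0.2, 0.3, 0.4])
```

Equivalently, `q_i / p_i` is exponential with rate `p_i`, and the minimum of independent exponentials with rates `p_i` falls on index `i` with probability `p_i / sum(p)`.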
- - for sampling_type in SamplingType: - if sampling_type not in sample_metadata: - continue - seq_group_ids, seq_groups, is_prompts, sample_indices = sample_metadata[ - sampling_type] - if sampling_type == SamplingType.GREEDY: - sample_results = _greedy_sample(seq_groups, greedy_samples) - elif sampling_type == SamplingType.RANDOM: - sample_results = _random_sample(seq_groups, is_prompts, - multinomial_samples) - elif sampling_type == SamplingType.BEAM: - sample_results = _beam_search_sample(seq_groups, is_prompts, - sampling_metadata.seq_data, - beam_search_logprobs) - sample_results_dict.update(zip(seq_group_ids, sample_results)) - - sample_results = [ - sample_results_dict[i] - for i in range(len(sampling_metadata.seq_groups)) - ] - return sample_results - - -def _get_logprobs( - logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, - sample_results: List[Tuple[List[int], List[int]]], -) -> Tuple[List[Optional[List[Optional[Dict[int, float]]]]], List[List[Dict[ - int, float]]]]: - # Prepare query indices - batched_logprobs_query_seq_indices: List[int] = [] - batched_logprobs_query_token_indices: List[int] = [] - largest_num_logprobs = 0 - sample_idx = 0 - for i, (seq_group, sample_result) in enumerate( - zip(sampling_metadata.seq_groups, sample_results)): - seq_ids, sampling_params = seq_group - next_token_ids, parent_ids = sample_result - num_parent_seqs = len(seq_ids) - if (i < sampling_metadata.num_prompts - and sampling_params.prompt_logprobs is not None): - largest_num_logprobs = max(largest_num_logprobs, - sampling_params.prompt_logprobs) - prompt_len = sampling_metadata.prompt_lens[i] - prompt_tokens = sampling_metadata.seq_data[ - seq_ids[0]].prompt_token_ids - batched_logprobs_query_seq_indices.extend( - sample_idx + j for j in range(prompt_len - 1)) - batched_logprobs_query_token_indices.extend( - token_id for token_id in prompt_tokens[1:]) - sample_idx += prompt_len - 1 - batched_logprobs_query_seq_indices.extend( - [sample_idx + parent_id for parent_id in parent_ids]) - batched_logprobs_query_token_indices.extend(next_token_ids) - if sampling_params.logprobs is not None: - largest_num_logprobs = max(largest_num_logprobs, - sampling_params.logprobs) - sample_idx += num_parent_seqs - assert sample_idx == logprobs.size(0) - - # Batched query for logprobs of selected token - batched_logprobs_query_result = logprobs[[ - batched_logprobs_query_seq_indices, - batched_logprobs_query_token_indices - ]] - - # Batched query for logprobs of topk tokens - if largest_num_logprobs > 0: - top_logprobs, top_token_ids = torch.topk(logprobs, - largest_num_logprobs, - dim=-1) - top_logprobs = top_logprobs.cpu() - top_token_ids = top_token_ids.cpu() - else: - top_logprobs, top_token_ids = None, None - - batched_logprobs_query_result = batched_logprobs_query_result.cpu() - - # Gather results - result_prompt_logprobs: List[Optional[PromptLogprobs]] = [] - result_sample_logprobs: List[SampleLogprobs] = [] - sample_idx = 0 - query_result_idx = 0 - for i, (seq_group, sample_result) in enumerate( - zip(sampling_metadata.seq_groups, sample_results)): - seq_ids, sampling_params = seq_group - next_token_ids, parent_ids = sample_result - - # Prompt logprobs - if (i < sampling_metadata.num_prompts - and sampling_params.prompt_logprobs is not None): - num_logprobs = sampling_params.prompt_logprobs - prompt_len = sampling_metadata.prompt_lens[i] - prompt_tokens = sampling_metadata.seq_data[ - seq_ids[0]].prompt_token_ids - group_prompt_logprobs: PromptLogprobs = [None] - for token_id in 
prompt_tokens[1:]: - prompt_logprobs_dict = { - token_id: - batched_logprobs_query_result[query_result_idx].item() - } - if num_logprobs > 0: - prompt_logprobs_dict.update( - zip(top_token_ids[sample_idx, :num_logprobs].tolist(), - top_logprobs[sample_idx, :num_logprobs].tolist())) - group_prompt_logprobs.append(prompt_logprobs_dict) - sample_idx += 1 - query_result_idx += 1 - result_prompt_logprobs.append(group_prompt_logprobs) - else: - result_prompt_logprobs.append(None) - - # Sample logprobs - num_logprobs = sampling_params.logprobs - if num_logprobs is None: - num_logprobs = 0 - group_sample_logprobs: SampleLogprobs = [] - for next_token_id, parent_id in zip(next_token_ids, parent_ids): - sample_logprobs_dict = { - next_token_id: - batched_logprobs_query_result[query_result_idx].item() - } - query_result_idx += 1 - if num_logprobs > 0: - sample_logprobs_dict.update( - zip( - top_token_ids[sample_idx + - parent_id, :num_logprobs].tolist(), - top_logprobs[sample_idx + - parent_id, :num_logprobs].tolist())) - group_sample_logprobs.append(sample_logprobs_dict) - result_sample_logprobs.append(group_sample_logprobs) - sample_idx += len(seq_ids) - - return result_prompt_logprobs, result_sample_logprobs - - -def _build_sampler_output( - sample_results: List[Tuple[List[int], List[int]]], - sampling_metadata: SamplingMetadata, - prompt_logprobs: List[Optional[PromptLogprobs]], - sample_logprobs: List[SampleLogprobs], -) -> SamplerOutput: - sampler_output = [] - for (seq_group, sample_result, group_prompt_logprobs, - group_sample_logprobs) in zip(sampling_metadata.seq_groups, - sample_results, prompt_logprobs, - sample_logprobs): - seq_ids, _ = seq_group - next_token_ids, parent_ids = sample_result - seq_outputs = [] - for parent_id, next_token_id, logprobs in zip(parent_ids, - next_token_ids, - group_sample_logprobs): - seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) - sampler_output.append( - SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return sampler_output diff --git a/vllm/model_executor/layers/triton_kernel/__init__.py b/vllm/model_executor/layers/triton_kernel/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py deleted file mode 100644 index 8fa70054f02ca8f0449bf3c34f10cd66a54773de..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py +++ /dev/null @@ -1,728 +0,0 @@ -# The kernels in this file are adapted from LightLLM's context_attention_fwd: -# https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py - -import torch -import triton -import triton.language as tl - -if triton.__version__ >= "2.1.0": - - @triton.jit - def _fwd_kernel( - Q, - K, - V, - K_cache, - V_cache, - B_Loc, - sm_scale, - B_Start_Loc, - B_Seqlen, - B_Ctxlen, - block_size, - x, - Out, - stride_b_loc_b, - stride_b_loc_s, - stride_qbs, - stride_qh, - stride_qd, - stride_kbs, - stride_kh, - stride_kd, - stride_vbs, - stride_vh, - stride_vd, - stride_obs, - stride_oh, - stride_od, - stride_k_cache_bs, - stride_k_cache_h, - stride_k_cache_d, - stride_k_cache_bl, - stride_k_cache_x, - stride_v_cache_bs, - stride_v_cache_h, - stride_v_cache_d, - stride_v_cache_bl, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, - ): - cur_batch = 
tl.program_id(0) - cur_head = tl.program_id(1) - start_m = tl.program_id(2) - - cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch) - cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) - cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) - - block_start_loc = BLOCK_M * start_m - - # initialize offsets - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - off_q = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + - cur_head * stride_qh + offs_d[None, :] * stride_qd) - - q = tl.load( - Q + off_q, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) - - # # initialize pointer to m and l - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - l_i = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - - for start_n in range(0, cur_batch_ctx_len, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + - ((start_n + offs_n) // block_size) * stride_b_loc_s, - mask=(start_n + offs_n) < cur_batch_ctx_len, - other=0) - off_k = (bn[None, :] * stride_k_cache_bs + - cur_head * stride_k_cache_h + - (offs_d[:, None] // x) * stride_k_cache_d + - ((start_n + offs_n[None, :]) % block_size) * - stride_k_cache_bl + - (offs_d[:, None] % x) * stride_k_cache_x) - off_v = ( - bn[:, None] * stride_v_cache_bs + cur_head * stride_v_cache_h + - offs_d[None, :] * stride_v_cache_d + - (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) - k = tl.load(K_cache + off_k, - mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len, - other=0.0) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, - float("-inf")) - qk *= sm_scale - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - p = tl.exp(qk - m_ij[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - m_i_new = tl.maximum(m_i, m_ij) - alpha = tl.exp(m_i - m_i_new) - beta = tl.exp(m_ij - m_i_new) - l_i_new = alpha * l_i + beta * l_ij - # -- update output accumulator -- - # scale p - p_scale = beta / l_i_new - p = p * p_scale[:, None] - # scale acc - acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v = tl.load(V_cache + off_v, - mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len, - other=0.0) - - p = p.to(v.dtype) - acc += tl.dot(p, v) - # # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - off_k = (offs_n[None, :] * stride_kbs + cur_head * stride_kh + - offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_head * stride_vh + - offs_d[None, :] * stride_vd) - k_ptrs = K + off_k - v_ptrs = V + off_v - - block_mask = tl.where( - block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0) - - for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - k = tl.load(k_ptrs + - (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=(start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk *= sm_scale - qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, - float("-inf")) - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - p = tl.exp(qk - m_ij[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - m_i_new = tl.maximum(m_i, m_ij) - alpha = tl.exp(m_i - 
m_i_new) - beta = tl.exp(m_ij - m_i_new) - l_i_new = alpha * l_i + beta * l_ij - # -- update output accumulator -- - # scale p - p_scale = beta / l_i_new - p = p * p_scale[:, None] - # scale acc - acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v = tl.load(v_ptrs + - (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) - - p = p.to(v.dtype) - acc += tl.dot(p, v) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - # initialize pointers to output - off_o = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + - cur_head * stride_oh + offs_d[None, :] * stride_od) - out_ptrs = Out + off_o - tl.store(out_ptrs, - acc, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len) - return - - @triton.jit - def _fwd_kernel_flash_attn_v2( - Q, - K, - V, - K_cache, - V_cache, - B_Loc, - sm_scale, - B_Start_Loc, - B_Seqlen, - B_Ctxlen, - block_size, - x, - Out, - stride_b_loc_b, - stride_b_loc_s, - stride_qbs, - stride_qh, - stride_qd, - stride_kbs, - stride_kh, - stride_kd, - stride_vbs, - stride_vh, - stride_vd, - stride_obs, - stride_oh, - stride_od, - stride_k_cache_bs, - stride_k_cache_h, - stride_k_cache_d, - stride_k_cache_bl, - stride_k_cache_x, - stride_v_cache_bs, - stride_v_cache_h, - stride_v_cache_d, - stride_v_cache_bl, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, - ): - cur_batch = tl.program_id(0) - cur_head = tl.program_id(1) - start_m = tl.program_id(2) - - cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch) - cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) - cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) - - block_start_loc = BLOCK_M * start_m - - # initialize offsets - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - off_q = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + - cur_head * stride_qh + offs_d[None, :] * stride_qd) - - q = tl.load( - Q + off_q, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) - - # # initialize pointer to m and l - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - l_i = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - - for start_n in range(0, cur_batch_ctx_len, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + - ((start_n + offs_n) // block_size) * stride_b_loc_s, - mask=(start_n + offs_n) < cur_batch_ctx_len, - other=0) - off_k = (bn[None, :] * stride_k_cache_bs + - cur_head * stride_k_cache_h + - (offs_d[:, None] // x) * stride_k_cache_d + - ((start_n + offs_n[None, :]) % block_size) * - stride_k_cache_bl + - (offs_d[:, None] % x) * stride_k_cache_x) - off_v = ( - bn[:, None] * stride_v_cache_bs + cur_head * stride_v_cache_h + - offs_d[None, :] * stride_v_cache_d + - (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) - k = tl.load(K_cache + off_k, - mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len, - other=0.0) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, - float("-inf")) - qk *= sm_scale - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - m_i_new = tl.maximum(m_i, m_ij) - p = tl.math.exp(qk - m_i_new[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - - alpha = 
tl.math.exp(m_i - m_i_new) - l_i_new = alpha * l_i + l_ij - # -- update output accumulator -- - # scale p - # scale acc - acc_scale = alpha - # acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v = tl.load(V_cache + off_v, - mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len, - other=0.0) - - p = p.to(v.dtype) - acc += tl.dot(p, v) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - off_k = (offs_n[None, :] * stride_kbs + cur_head * stride_kh + - offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_head * stride_vh + - offs_d[None, :] * stride_vd) - k_ptrs = K + off_k - v_ptrs = V + off_v - - block_mask = tl.where( - block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0) - - for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - k = tl.load(k_ptrs + - (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=(start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk *= sm_scale - qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, - float("-inf")) - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - m_i_new = tl.maximum(m_i, m_ij) - p = tl.math.exp(qk - m_i_new[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - - alpha = tl.math.exp(m_i - m_i_new) - l_i_new = alpha * l_i + l_ij - # -- update output accumulator -- - # scale p - # scale acc - acc_scale = alpha - # acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v = tl.load(v_ptrs + - (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) - - p = p.to(v.dtype) - acc += tl.dot(p, v) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - # acc /= l_i[:, None] - # initialize pointers to output - off_o = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + - cur_head * stride_oh + offs_d[None, :] * stride_od) - out_ptrs = Out + off_o - tl.store(out_ptrs, - acc, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len) - return - - @triton.jit - def _fwd_kernel_alibi( - Q, - K, - V, - K_cache, - V_cache, - B_Loc, - sm_scale, - B_Start_Loc, - B_Seqlen, - B_Ctxlen, - Alibi_slopes, - block_size, - x, - Out, - stride_b_loc_b, - stride_b_loc_s, - stride_qbs, - stride_qh, - stride_qd, - stride_kbs, - stride_kh, - stride_kd, - stride_vbs, - stride_vh, - stride_vd, - stride_obs, - stride_oh, - stride_od, - stride_k_cache_bs, - stride_k_cache_h, - stride_k_cache_d, - stride_k_cache_bl, - stride_k_cache_x, - stride_v_cache_bs, - stride_v_cache_h, - stride_v_cache_d, - stride_v_cache_bl, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, - ): - # attn_bias[] - cur_batch = tl.program_id(0) - cur_head = tl.program_id(1) - start_m = tl.program_id(2) - - # cur_batch_seq_len: the length of prompts - # cur_batch_ctx_len: the length of prefix - # cur_batch_in_all_start_index: the start id of the dim=0 - cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch) - cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) - cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) - - block_start_loc = BLOCK_M * start_m - - # initialize offsets - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - off_q = ( - (cur_batch_in_all_start_index 
+ offs_m[:, None]) * stride_qbs + - cur_head * stride_qh + offs_d[None, :] * stride_qd) - - q = tl.load( - Q + off_q, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) - - # # initialize pointer to m and l - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - l_i = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - - alibi_slope = tl.load(Alibi_slopes + cur_head) - alibi_start_q = tl.arange( - 0, BLOCK_M) + block_start_loc + cur_batch_ctx_len - alibi_start_k = 0 - for start_n in range(0, cur_batch_ctx_len, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + - ((start_n + offs_n) // block_size) * stride_b_loc_s, - mask=(start_n + offs_n) < cur_batch_ctx_len, - other=0) - off_k = (bn[None, :] * stride_k_cache_bs + - cur_head * stride_k_cache_h + - (offs_d[:, None] // x) * stride_k_cache_d + - ((start_n + offs_n[None, :]) % block_size) * - stride_k_cache_bl + - (offs_d[:, None] % x) * stride_k_cache_x) - off_v = ( - bn[:, None] * stride_v_cache_bs + cur_head * stride_v_cache_h + - offs_d[None, :] * stride_v_cache_d + - (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) - k = tl.load(K_cache + off_k, - mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len, - other=0.0) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, - float("-inf")) - qk *= sm_scale - - # load alibi - alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - - alibi_start_q[:, None]) * alibi_slope - alibi = tl.where( - (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), - alibi, float("-inf")) - qk += alibi - alibi_start_k += BLOCK_N - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - m_i_new = tl.maximum(m_i, m_ij) - p = tl.math.exp(qk - m_i_new[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - - alpha = tl.math.exp(m_i - m_i_new) - l_i_new = alpha * l_i + l_ij - # -- update output accumulator -- - # scale p - # scale acc - acc_scale = alpha - # acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v = tl.load(V_cache + off_v, - mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len, - other=0.0) - - p = p.to(v.dtype) - acc += tl.dot(p, v, allow_tf32=False) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - off_k = (offs_n[None, :] * stride_kbs + cur_head * stride_kh + - offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_head * stride_vh + - offs_d[None, :] * stride_vd) - k_ptrs = K + off_k - v_ptrs = V + off_v - - block_mask = tl.where( - block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0) - - # init alibi - alibi_slope = tl.load(Alibi_slopes + cur_head) - alibi_start_q = tl.arange( - 0, BLOCK_M) + block_start_loc + cur_batch_ctx_len - alibi_start_k = cur_batch_ctx_len - # # init debuger - # offset_db_q = tl.arange(0, BLOCK_M) + block_start_loc - # offset_db_k = tl.arange(0, BLOCK_N) - # calc q[BLOCK_M, BLOCK_MODEL] mul k[prefix_len: , BLOCK_DMODEL] - for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - k = tl.load(k_ptrs + - (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=(start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k, allow_tf32=False) - qk *= sm_scale 
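Stepping outside the kernel body for a moment: the `m_i` / `l_i` / `acc` updates carried through these loops are the standard online-softmax recurrence, which lets the attention output be accumulated block by block without materializing a full score row. A rough plain-PyTorch restatement for a single query (shapes and block size are illustrative; like some of the kernel variants here, it defers the division by the running sum to the very end):

```python
import torch

torch.manual_seed(0)
d, n, block = 16, 64, 16
q = torch.randn(d)
k = torch.randn(n, d)
v = torch.randn(n, d)
sm_scale = d ** -0.5

m_i = torch.tensor(float("-inf"))   # running max of the attention scores
l_i = torch.tensor(0.0)             # running sum of exp(score - m_i)
acc = torch.zeros(d)                # running (unnormalized) weighted sum of V

for start in range(0, n, block):
    s = (k[start:start + block] @ q) * sm_scale      # scores for this block
    m_new = torch.maximum(m_i, s.max())
    alpha = torch.exp(m_i - m_new)                   # rescales the old state
    p = torch.exp(s - m_new)
    l_i = alpha * l_i + p.sum()
    acc = alpha * acc + p @ v[start:start + block]
    m_i = m_new

out = acc / l_i
reference = torch.softmax((k @ q) * sm_scale, dim=0) @ v
print(torch.allclose(out, reference, atol=1e-5))     # True
```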
- qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, - float("-inf")) - - # load alibi - alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - - alibi_start_q[:, None]) * alibi_slope - alibi = tl.where( - (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), - alibi, float("-inf")) - qk += alibi - alibi_start_k += BLOCK_N - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - m_i_new = tl.maximum(m_i, m_ij) - p = tl.math.exp(qk - m_i_new[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - - alpha = tl.math.exp(m_i - m_i_new) - l_i_new = alpha * l_i + l_ij - # -- update output accumulator -- - # scale p - # scale acc - acc_scale = alpha - # acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v = tl.load(v_ptrs + - (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) - - p = p.to(v.dtype) - acc += tl.dot(p, v, allow_tf32=False) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - acc = acc / l_i[:, None] - - # initialize pointers to output - off_o = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + - cur_head * stride_oh + offs_d[None, :] * stride_od) - out_ptrs = Out + off_o - tl.store(out_ptrs, - acc, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len) - return - - @torch.inference_mode() - def context_attention_fwd(q, - k, - v, - o, - k_cache, - v_cache, - b_loc, - b_start_loc, - b_seq_len, - b_ctx_len, - max_input_len, - alibi_slopes=None): - BLOCK = 128 - # shape constraints - Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] - assert Lq == Lk and Lk == Lv - assert Lk in {16, 32, 64, 128} - - sm_scale = 1.0 / (Lq**0.5) - batch, head = b_seq_len.shape[0], q.shape[1] - - grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, - - num_warps = 8 if Lk <= 64 else 8 - if alibi_slopes is not None: - _fwd_kernel_alibi[grid]( - q, - k, - v, - k_cache, - v_cache, - b_loc, - sm_scale, - b_start_loc, - b_seq_len, - b_ctx_len, - alibi_slopes, - v_cache.shape[3], - 8, - o, - b_loc.stride(0), - b_loc.stride(1), - q.stride(0), - q.stride(1), - q.stride(2), - k.stride(0), - k.stride(1), - k.stride(2), - v.stride(0), - v.stride(1), - v.stride(2), - o.stride(0), - o.stride(1), - o.stride(2), - k_cache.stride(0), - k_cache.stride(1), - k_cache.stride(2), - k_cache.stride(3), - k_cache.stride( - 4 - ), #[num_blocks, num_kv_heads, head_size/x, block_size, x] - v_cache.stride(0), - v_cache.stride(1), - v_cache.stride(2), - v_cache.stride( - 3), #[num_blocks, num_kv_heads, head_size, block_size] - BLOCK_M=BLOCK, - BLOCK_DMODEL=Lk, - BLOCK_N=BLOCK, - num_warps=num_warps, - num_stages=1, - ) - return - - _fwd_kernel[grid]( - q, - k, - v, - k_cache, - v_cache, - b_loc, - sm_scale, - b_start_loc, - b_seq_len, - b_ctx_len, - v_cache.shape[3], - 8, - o, - b_loc.stride(0), - b_loc.stride(1), - q.stride(0), - q.stride(1), - q.stride(2), - k.stride(0), - k.stride(1), - k.stride(2), - v.stride(0), - v.stride(1), - v.stride(2), - o.stride(0), - o.stride(1), - o.stride(2), - k_cache.stride(0), - k_cache.stride(1), - k_cache.stride(2), - k_cache.stride(3), - k_cache.stride( - 4), #[num_blocks, num_kv_heads, head_size/x, block_size, x] - v_cache.stride(0), - v_cache.stride(1), - v_cache.stride(2), - v_cache.stride( - 3), #[num_blocks, num_kv_heads, head_size, block_size] - BLOCK_M=BLOCK, - BLOCK_DMODEL=Lk, - BLOCK_N=BLOCK, - num_warps=num_warps, - num_stages=1, - ) - return diff --git 
a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py deleted file mode 100644 index 9c5fb890251ed5e500a3e5c90691520ecdc95383..0000000000000000000000000000000000000000 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ /dev/null @@ -1,153 +0,0 @@ -from typing import Optional, Sequence - -import torch -import torch.nn.functional as F -from torch.nn.parameter import Parameter - -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, -) -from vllm.model_executor.parallel_utils.utils import divide -from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_reduce) -from vllm.model_executor.utils import set_weight_attrs - -DEFAULT_VOCAB_PADDING_SIZE = 64 - - -def pad_vocab_size(vocab_size: int, - pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int: - """Pad the vocab size to the given value.""" - return ((vocab_size + pad_to - 1) // pad_to) * pad_to - - -def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size: int, - rank: int) -> Sequence[int]: - index_f = rank * per_partition_vocab_size - index_l = index_f + per_partition_vocab_size - return index_f, index_l - - -def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, - world_size: int) -> Sequence[int]: - per_partition_vocab_size = divide(global_vocab_size, world_size) - return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, - rank) - - -class VocabParallelEmbedding(torch.nn.Module): - """Embedding parallelized in the vocabulary dimension. - - Adapted from torch.nn.Embedding, note that we pad the vocabulary size to - make sure it is divisible by the number of model parallel GPUs. - - Args: - num_embeddings: vocabulary size. - embedding_dim: size of hidden state. - params_dtype: type of the parameters. - org_num_embeddings: original vocabulary size (without LoRA). - padding_size: padding size for the vocabulary. - """ - - def __init__(self, - num_embeddings: int, - embedding_dim: int, - params_dtype: Optional[torch.dtype] = None, - org_num_embeddings: Optional[int] = None, - padding_size: int = DEFAULT_VOCAB_PADDING_SIZE): - super().__init__() - - # Keep the input dimensions. - self.num_embeddings = num_embeddings - self.org_vocab_size = org_num_embeddings or num_embeddings - self.num_embeddings_padded = pad_vocab_size(num_embeddings, - padding_size) - self.embedding_dim = embedding_dim - if params_dtype is None: - params_dtype = torch.get_default_dtype() - self.tp_size = get_tensor_model_parallel_world_size() - # Divide the weight matrix along the vocaburaly dimension. - self.vocab_start_index, self.vocab_end_index = ( - vocab_range_from_global_vocab_size( - self.num_embeddings_padded, get_tensor_model_parallel_rank(), - self.tp_size)) - self.num_embeddings_per_partition = (self.vocab_end_index - - self.vocab_start_index) - self.weight = Parameter( - torch.empty(self.num_embeddings_per_partition, - self.embedding_dim, - device=torch.cuda.current_device(), - dtype=params_dtype)) - set_weight_attrs(self.weight, { - "parallel_dim": 0, - "weight_loader": self.weight_loader - }) - - def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): - parallel_dim = param.parallel_dim - assert loaded_weight.shape[parallel_dim] == self.org_vocab_size - loaded_weight = loaded_weight[self.vocab_start_index:self. 
- vocab_end_index] - param[:loaded_weight.shape[0]].data.copy_(loaded_weight) - - def forward(self, input_): - if self.tp_size > 1: - # Build the mask. - input_mask = ((input_ < self.vocab_start_index) | - (input_ >= self.vocab_end_index)) - # Mask the input. - masked_input = input_.clone() - self.vocab_start_index - masked_input[input_mask] = 0 - else: - masked_input = input_ - # Get the embeddings. - output_parallel = F.embedding(masked_input, self.weight) - # Mask the output embedding. - if self.tp_size > 1: - output_parallel[input_mask, :] = 0.0 - # Reduce across all the model parallel GPUs. - output = tensor_model_parallel_all_reduce(output_parallel) - return output - - -class ParallelLMHead(VocabParallelEmbedding): - """Parallelized LM head. - - Output logits weight matrices used in the Sampler. The weight and bias - tensors are padded to make sure they are divisible by the number of - model parallel GPUs. - - Args: - num_embeddings: vocabulary size. - embedding_dim: size of hidden state. - bias: whether to use bias. - params_dtype: type of the parameters. - org_num_embeddings: original vocabulary size (without LoRA). - padding_size: padding size for the vocabulary. - """ - - def __init__(self, - num_embeddings: int, - embedding_dim: int, - bias: bool = False, - params_dtype: Optional[torch.dtype] = None, - org_num_embeddings: Optional[int] = None, - padding_size: int = DEFAULT_VOCAB_PADDING_SIZE): - super().__init__(num_embeddings, embedding_dim, params_dtype, - org_num_embeddings, padding_size) - if bias: - self.bias = Parameter( - torch.empty(self.num_embeddings_per_partition, - device=torch.cuda.current_device(), - dtype=params_dtype)) - set_weight_attrs(self.bias, { - "parallel_dim": 0, - "weight_loader": self.weight_loader - }) - else: - self.register_parameter("bias", None) - - def forward(self, input_): - del input_ - raise RuntimeError("LMHead's weights should be used in the sampler.") diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py deleted file mode 100644 index 0f1125e5c8e3e576be067de40be9689a36526535..0000000000000000000000000000000000000000 --- a/vllm/model_executor/model_loader.py +++ /dev/null @@ -1,85 +0,0 @@ -"""Utilities for selecting and loading models.""" -import contextlib -from typing import Optional, Type - -import torch -import torch.nn as nn -from transformers import PretrainedConfig - -from vllm.config import ModelConfig, LoRAConfig -from vllm.model_executor.models import ModelRegistry -from vllm.model_executor.weight_utils import (get_quant_config, - initialize_dummy_weights) - - -@contextlib.contextmanager -def _set_default_torch_dtype(dtype: torch.dtype): - """Sets the default torch dtype to the given dtype.""" - old_dtype = torch.get_default_dtype() - torch.set_default_dtype(dtype) - yield - torch.set_default_dtype(old_dtype) - - -def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: - architectures = getattr(config, "architectures", []) - for arch in architectures: - model_cls = ModelRegistry.load_model_cls(arch) - if model_cls is not None: - return model_cls - raise ValueError( - f"Model architectures {architectures} are not supported for now. " - f"Supported architectures: {ModelRegistry.get_supported_archs()}") - - -def get_model(model_config: ModelConfig, - lora_config: Optional[LoRAConfig] = None) -> nn.Module: - model_class = _get_model_architecture(model_config.hf_config) - - # Get the (maybe quantized) linear method. 
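The embedding above shards the vocabulary across tensor-parallel ranks: each rank owns a contiguous slice of rows, masks out token ids that belong to other ranks, and the partial outputs are summed by an all-reduce. As a single-process sketch of that scheme, the loop below emulates the ranks and the final sum (vocabulary padding, LoRA, and weight loading are omitted; all sizes are illustrative):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
vocab_size, hidden, world_size = 12, 4, 3
per_rank = vocab_size // world_size
full_weight = torch.randn(vocab_size, hidden)
input_ids = torch.tensor([0, 5, 11, 7])

output = torch.zeros(len(input_ids), hidden)
for rank in range(world_size):
    start, end = rank * per_rank, (rank + 1) * per_rank
    shard = full_weight[start:end]                   # this rank's rows
    mask = (input_ids < start) | (input_ids >= end)
    local_ids = (input_ids - start).masked_fill(mask, 0)
    partial = F.embedding(local_ids, shard)
    partial[mask] = 0.0                              # zero out foreign tokens
    output += partial                                # stands in for all-reduce

print(torch.allclose(output, F.embedding(input_ids, full_weight)))  # True
```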
- linear_method = None - if model_config.quantization is not None: - quant_config = get_quant_config(model_config.quantization, - model_config.model, - model_config.hf_config, - model_config.download_dir) - capability = torch.cuda.get_device_capability() - capability = capability[0] * 10 + capability[1] - if capability < quant_config.get_min_capability(): - raise ValueError( - f"The quantization method {model_config.quantization} is not " - "supported for the current GPU. " - f"Minimum capability: {quant_config.get_min_capability()}. " - f"Current capability: {capability}.") - supported_dtypes = quant_config.get_supported_act_dtypes() - if model_config.dtype not in supported_dtypes: - raise ValueError( - f"{model_config.dtype} is not supported for quantization " - f"method {model_config.quantization}. Supported dtypes: " - f"{supported_dtypes}") - linear_method = quant_config.get_linear_method() - - with _set_default_torch_dtype(model_config.dtype): - # Create a model instance. - # The weights will be initialized as empty tensors. - with torch.device("cuda"): - if getattr(model_class, "supports_lora", False): - model = model_class(model_config.hf_config, linear_method, - lora_config) - elif lora_config: - raise ValueError( - f"Model {model_class.__name__} does not support LoRA, " - "but LoRA is enabled. Support for this model may " - "be added in the future. If this is important to you, " - "please open an issue on github.") - else: - model = model_class(model_config.hf_config, linear_method) - if model_config.load_format == "dummy": - # NOTE(woosuk): For accurate performance evaluation, we assign - # random values to the weights. - initialize_dummy_weights(model) - else: - # Load the weights from the cached or downloaded files. - model.load_weights(model_config.model, model_config.download_dir, - model_config.load_format, model_config.revision) - return model.eval() diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py deleted file mode 100644 index 93631d260abcb30ab5e04c7c3343da27113ee9e3..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/__init__.py +++ /dev/null @@ -1,88 +0,0 @@ -import importlib -from typing import List, Optional, Type - -import torch.nn as nn - -from vllm.logger import init_logger -from vllm.utils import is_hip - -logger = init_logger(__name__) - -# Architecture -> (module, class). 
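The architecture table defined just below feeds a small lazy-import registry: an architecture name from the HF config maps to a (module, class) pair that is imported on demand. As a generic sketch of that pattern only, with placeholder package, module, and class names rather than real vLLM entries:

```python
import importlib
from typing import List, Optional, Type

# Hypothetical mapping for illustration; the real table follows below.
_TOY_MODELS = {
    "MyArchForCausalLM": ("my_arch", "MyArchForCausalLM"),
}

def load_model_cls(model_arch: str) -> Optional[Type]:
    if model_arch not in _TOY_MODELS:
        return None
    module_name, cls_name = _TOY_MODELS[model_arch]
    module = importlib.import_module(f"my_package.models.{module_name}")
    return getattr(module, cls_name, None)

def resolve(architectures: List[str]) -> Type:
    # Try each architecture listed in the config until one is registered.
    for arch in architectures:
        cls = load_model_cls(arch)
        if cls is not None:
            return cls
    raise ValueError(f"Unsupported architectures: {architectures}")
```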
-_MODELS = { - "AquilaModel": ("aquila", "AquilaForCausalLM"), - "AquilaForCausalLM": ("aquila", "AquilaForCausalLM"), # AquilaChat2 - "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b - "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b - "BloomForCausalLM": ("bloom", "BloomForCausalLM"), - "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), - "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), - "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), - "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), - "FalconForCausalLM": ("falcon", "FalconForCausalLM"), - "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), - "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), - "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), - "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"), - "InternLMForCausalLM": ("internlm", "InternLMForCausalLM"), - "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), - # For decapoda-research/llama-* - "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), - "MistralForCausalLM": ("mistral", "MistralForCausalLM"), - "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), - # transformers's mpt class has lower case - "MptForCausalLM": ("mpt", "MPTForCausalLM"), - "MPTForCausalLM": ("mpt", "MPTForCausalLM"), - "OPTForCausalLM": ("opt", "OPTForCausalLM"), - "PhiForCausalLM": ("phi", "PhiForCausalLM"), - "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), - "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), - "RWForCausalLM": ("falcon", "FalconForCausalLM"), - "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), - "YiForCausalLM": ("yi", "YiForCausalLM") -} - -# Models not supported by ROCm. -_ROCM_UNSUPPORTED_MODELS = [] - -# Models partially supported by ROCm. -# Architecture -> Reason. -_ROCM_PARTIALLY_SUPPORTED_MODELS = { - "Qwen2ForCausalLM": - "Sliding window attention is not yet supported in ROCm's flash attention", - "MistralForCausalLM": - "Sliding window attention is not yet supported in ROCm's flash attention", - "MixtralForCausalLM": - "Sliding window attention is not yet supported in ROCm's flash attention", -} - - -class ModelRegistry: - - @staticmethod - def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]: - if model_arch not in _MODELS: - return None - if is_hip(): - if model_arch in _ROCM_UNSUPPORTED_MODELS: - raise ValueError( - f"Model architecture {model_arch} is not supported by " - "ROCm for now.") - if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: - logger.warning( - f"Model architecture {model_arch} is partially supported " - "by ROCm: " + _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) - - module_name, model_cls_name = _MODELS[model_arch] - module = importlib.import_module( - f"vllm.model_executor.models.{module_name}") - return getattr(module, model_cls_name, None) - - @staticmethod - def get_supported_archs() -> List[str]: - return list(_MODELS.keys()) - - -__all__ = [ - "ModelRegistry", -] diff --git a/vllm/model_executor/models/aquila.py b/vllm/model_executor/models/aquila.py deleted file mode 100644 index 2f2bd5ffb4a63a137c6ff9c455b558c809cba75b..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/aquila.py +++ /dev/null @@ -1,342 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
-# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.aquila import AquilaConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class AquilaMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class AquilaRMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - """ - AquilaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - variance = hidden_states.to(torch.float32).pow(2).mean(-1, - keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - - return (self.weight * hidden_states).to(input_dtype) - - -class AquilaAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - max_position_embeddings: int = 8192, - rope_scaling: Optional[Dict[str, Any]] = None, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - assert self.total_num_kv_heads % tp_size == 0 - self.num_kv_heads = self.total_num_kv_heads // tp_size - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class AquilaDecoderLayer(nn.Module): - - def __init__( - self, - config: AquilaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = AquilaAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - max_position_embeddings=max_position_embeddings, - rope_scaling=rope_scaling, - linear_method=linear_method, - ) - self.mlp = AquilaMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - 
hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = AquilaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - # Self Attention - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - return hidden_states - - -class AquilaModel(nn.Module): - - def __init__( - self, - config: AquilaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - AquilaDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - ) - hidden_states = self.norm(hidden_states) - - return hidden_states - - -class AquilaForCausalLM(nn.Module): - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = AquilaModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias 
for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py deleted file mode 100644 index f08c3c8d257ff8418af1dab08e61f3cae0b8dba2..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/baichuan.py +++ /dev/null @@ -1,386 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only BaiChuan model compatible with HuggingFace weights.""" -import math -from typing import List, Optional, Tuple - -import torch -from torch import nn - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.baichuan import BaiChuanConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: - closest_power_of_2 = 2**math.floor(math.log2(total_num_heads)) - base = torch.tensor( - 2**(-(2**-(math.log2(closest_power_of_2) - 3))), - dtype=torch.float32, - ) - powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32) - slopes = torch.pow(base, powers) - - if closest_power_of_2 != total_num_heads: - extra_base = torch.tensor( - 2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), - dtype=torch.float32, - ) - num_remaining_heads = min(closest_power_of_2, - total_num_heads - closest_power_of_2) - extra_powers = 
torch.arange(start=1, - end=1 + 2 * num_remaining_heads, - step=2, - dtype=torch.int32) - slopes = torch.cat( - [slopes, torch.pow(extra_base, extra_powers)], dim=0) - return slopes - - -class BaiChuanMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class BaiChuanAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__( - self, - hidden_size: int, - num_heads: int, - position_embedding: str, - rope_theta: float = 10000, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = hidden_size - tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( - ) - self.total_num_heads = num_heads - assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = (self.total_num_heads // - tensor_model_parallel_world_size) - self.head_dim = hidden_size // self.total_num_heads - self.postion_embedding = position_embedding - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - # pylint: disable=invalid-name - self.W_pack = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_heads, - bias=False, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - # Create the alibi slopes and slice them. 
- if self.postion_embedding == "ALIBI": - tp_rank = get_tensor_model_parallel_rank() - head_start = tp_rank * self.num_heads - head_end = (tp_rank + 1) * self.num_heads - alibi_slopes = _get_alibi_slopes(self.total_num_heads) - alibi_slopes = alibi_slopes[head_start:head_end].tolist() - - scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes) - else: - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_theta, - ) - self.scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, self.head_dim, - self.scaling) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.W_pack(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - if self.postion_embedding != "ALIBI": - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class BaiChuanDecoderLayer(nn.Module): - - def __init__(self, - config: BaiChuanConfig, - position_embedding: str, - linear_method: Optional[LinearMethodBase] = None): - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = BaiChuanAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - position_embedding=position_embedding, - rope_theta=rope_theta, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - ) - self.mlp = BaiChuanMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class BaiChuanModel(nn.Module): - - def __init__(self, - config: BaiChuanConfig, - position_embedding: str, - linear_method: Optional[LinearMethodBase] = None): - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - BaiChuanDecoderLayer(config, position_embedding, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: 
torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class BaiChuanBaseForCausalLM(nn.Module): - - def __init__(self, - config, - position_embedding: str, - linear_method: Optional[LinearMethodBase] = None): - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = BaiChuanModel(config, position_embedding, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - if name == "lm_head.weight": - # Unlike Baichuan, Baichuan2 normalizes the head weights. Refer to: - # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508 - # Distinguish between Baichuan and Baichuan2 by checking the - # vocab size. This is suggested by - # https://github.com/vllm-project/vllm/pull/1022#discussion_r1325652704 - is_baichuan2 = self.config.vocab_size == 125696 - if is_baichuan2: - loaded_weight = torch.nn.functional.normalize( - loaded_weight) - - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - - -class BaichuanForCausalLM(BaiChuanBaseForCausalLM): - """Baichuan 13B and Baichuan2 7B/13B.""" - - def __init__(self, - config, - linear_method: Optional[LinearMethodBase] = None): - if config.hidden_size == 4096: # baichuan2 7b - super().__init__(config, "ROPE", linear_method) - else: # baichuan 13b, baichuan2 13b - super().__init__(config, "ALIBI", linear_method) - - -class BaiChuanForCausalLM(BaiChuanBaseForCausalLM): - """Baichuan 7B.""" - - def __init__(self, - config, - linear_method: Optional[LinearMethodBase] = None): - super().__init__(config, "ROPE", linear_method) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py deleted file mode 100644 index 4adfb6b78102fb0439a63442e42d0c63e2d3f68f..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/bloom.py +++ /dev/null @@ -1,330 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py -# Copyright 2023 The CacheFlow team. -# Copyright 2022 HuggingFace Inc. team and BigScience workshop. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
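Both the ALIBI branch of `BaiChuanAttention` above and the BLOOM attention that follows build per-head ALiBi slopes with the same `_get_alibi_slopes` helper and then keep only the slice owned by the current tensor-parallel rank. A small self-contained sketch of that math for the power-of-two case, with illustrative sizes (8 heads split across 2 ranks) that do not come from any shipped config:

```python
import math

import torch


def get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
    # Same construction as the in-tree helper for power-of-two head counts:
    # a geometric sequence with ratio 2**(-8 / total_num_heads). The real
    # helper additionally interleaves an extra sequence when the head count
    # is not a power of two.
    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
    base = torch.tensor(2**(-(2**-(math.log2(closest_power_of_2) - 3))),
                        dtype=torch.float32)
    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
    return torch.pow(base, powers)


total_num_heads, tp_size = 8, 2              # illustrative sizes
slopes = get_alibi_slopes(total_num_heads)   # [0.5, 0.25, ..., 0.00390625]

num_heads = total_num_heads // tp_size       # heads owned by each rank
for tp_rank in range(tp_size):
    head_start = tp_rank * num_heads
    head_end = (tp_rank + 1) * num_heads
    # Each rank hands only its own slice to PagedAttention.
    print(tp_rank, slopes[head_start:head_end].tolist())
```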
-"""Inference-only BLOOM model compatible with HuggingFace weights.""" -import math -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import BloomConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearMethodBase, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: - closest_power_of_2 = 2**math.floor(math.log2(total_num_heads)) - base = torch.tensor( - 2**(-(2**-(math.log2(closest_power_of_2) - 3))), - dtype=torch.float32, - ) - powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32) - slopes = torch.pow(base, powers) - - if closest_power_of_2 != total_num_heads: - extra_base = torch.tensor( - 2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), - dtype=torch.float32, - ) - num_remaining_heads = min(closest_power_of_2, - total_num_heads - closest_power_of_2) - extra_powers = torch.arange(start=1, - end=1 + 2 * num_remaining_heads, - step=2, - dtype=torch.int32) - slopes = torch.cat( - [slopes, torch.pow(extra_base, extra_powers)], dim=0) - return slopes - - -class BloomAttention(nn.Module): - - def __init__( - self, - config: BloomConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = config.hidden_size - self.total_num_heads = config.n_head - self.head_dim = self.hidden_size // self.total_num_heads - assert self.head_dim * self.total_num_heads == self.hidden_size - - tp_world_size = get_tensor_model_parallel_world_size() - assert self.total_num_heads % tp_world_size == 0 - self.num_heads = self.total_num_heads // tp_world_size - - self.query_key_value = QKVParallelLinear( - self.hidden_size, - self.head_dim, - self.total_num_heads, - bias=True, - linear_method=linear_method, - ) - self.dense = RowParallelLinear( - self.hidden_size, - self.hidden_size, - bias=True, - linear_method=linear_method, - ) - - # Create the alibi slopes and slice them. - tp_rank = get_tensor_model_parallel_rank() - head_start = tp_rank * self.num_heads - head_end = (tp_rank + 1) * self.num_heads - alibi_slopes = _get_alibi_slopes(self.total_num_heads) - alibi_slopes = alibi_slopes[head_start:head_end].tolist() - - scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes) - - def forward( - self, - position_ids: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - del position_ids # Unused. 
- qkv, _ = self.query_key_value(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.dense(attn_output) - return output - - -class BloomMLP(nn.Module): - - def __init__( - self, - config: BloomConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - hidden_size = config.hidden_size - self.dense_h_to_4h = ColumnParallelLinear( - hidden_size, - 4 * hidden_size, - linear_method=linear_method, - ) - quant_config = getattr(linear_method, "quant_config", None) - self.gelu_impl = get_act_fn("gelu", quant_config, 4 * hidden_size) - self.dense_4h_to_h = RowParallelLinear( - 4 * hidden_size, - hidden_size, - linear_method=linear_method, - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x, _ = self.dense_h_to_4h(x) - x = self.gelu_impl(x) - x, _ = self.dense_4h_to_h(x) - return x - - -class BloomBlock(nn.Module): - - def __init__( - self, - config: BloomConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - hidden_size = config.hidden_size - - self.input_layernorm = nn.LayerNorm(hidden_size, - eps=config.layer_norm_epsilon) - self.self_attention = BloomAttention(config, linear_method) - self.post_attention_layernorm = nn.LayerNorm( - hidden_size, eps=config.layer_norm_epsilon) - self.mlp = BloomMLP(config, linear_method) - self.apply_residual_connection_post_layernorm = ( - config.apply_residual_connection_post_layernorm) - - def forward( - self, - position_ids: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) - - # Layer norm post the self attention. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = hidden_states - - # Self attention. - attention_output = self.self_attention( - position_ids=position_ids, - hidden_states=layernorm_output, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - attention_output = attention_output + residual - layernorm_output = self.post_attention_layernorm(attention_output) - - # Get residual - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = attention_output - - # MLP. 
- output = self.mlp(layernorm_output) + residual - return output - - -class BloomModel(nn.Module): - - def __init__( - self, - config: BloomConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.embed_dim = config.hidden_size - - # Embedding + LN Embedding - self.word_embeddings = VocabParallelEmbedding( - config.vocab_size, - self.embed_dim, - ) - self.word_embeddings_layernorm = nn.LayerNorm( - self.embed_dim, eps=config.layer_norm_epsilon) - - # Transformer blocks - self.h = nn.ModuleList([ - BloomBlock(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - - # Final Layer Norm - self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - - def forward( - self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.word_embeddings(input_ids) - hidden_states = self.word_embeddings_layernorm(hidden_states) - for i in range(len(self.h)): - layer = self.h[i] - hidden_states = layer( - position_ids, - hidden_states, - kv_caches[i], - input_metadata, - ) - hidden_states = self.ln_f(hidden_states) - return hidden_states - - -class BloomForCausalLM(nn.Module): - - def __init__( - self, - config: BloomConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.transformer = BloomModel(config, linear_method) - self.lm_head_weight = self.transformer.word_embeddings.weight - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.transformer(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head_weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - params_dict = dict(self.named_parameters(remove_duplicate=False)) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if name == "lm_head.weight": - continue - if not name.startswith("transformer."): - name = "transformer." + name - param = params_dict[name] - - if "query_key_value" in name: - # NOTE: BLOOM's fused QKV's output_dim has the shape of - # (num_heads * 3 * head_size), while the - # required shape is (3 * num_heads * head_size). - # Thus, we need weight conversion. 
- output_dim = getattr(param, "output_dim", None) - num_heads = self.config.num_attention_heads - if output_dim is not None: - loaded_weight_shape = loaded_weight.shape - loaded_weight = loaded_weight.view( - loaded_weight_shape[:output_dim] + (num_heads, 3, -1) + - loaded_weight_shape[output_dim + 1:]) - loaded_weight = loaded_weight.transpose( - output_dim, output_dim + 1) - loaded_weight = loaded_weight.reshape(loaded_weight_shape) - - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py deleted file mode 100644 index dca8d724f976b9808cc417730cbaa778aa12a706..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/chatglm.py +++ /dev/null @@ -1,375 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/THUDM/ChatGLM2-6B -"""Inference-only ChatGLM model compatible with THUDM weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from torch.nn import LayerNorm - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs import ChatGLMConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class GLMAttention(nn.Module): - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = config.hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = config.num_attention_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.multi_query_attention = config.multi_query_attention - self.total_num_kv_heads = (config.multi_query_group_num - if config.multi_query_attention else - config.num_attention_heads) - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = config.hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - - self.query_key_value = QKVParallelLinear( - self.hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=config.add_bias_linear or config.add_qkv_bias, - linear_method=linear_method, - ) - self.dense = RowParallelLinear( - self.total_num_heads * self.head_dim, - config.hidden_size, - bias=config.add_bias_linear, - linear_method=linear_method, - ) - - # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 - rope_ratio = getattr(config, "rope_ratio", 1.0) - max_positions = getattr(config, "seq_length", 8192) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim // 2, - max_position=max_positions, - base=10000 * rope_ratio, - is_neox_style=False, - ) - self.attn = PagedAttention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - ) - - def forward( - self, - hidden_states: torch.Tensor, - position_ids: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.query_key_value(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(position_ids, q, k) - key_cache, value_cache = kv_cache - context_layer = self.attn( - q, - k, - v, - key_cache, - value_cache, - input_metadata, - ) - attn_output, _ = self.dense(context_layer) - return attn_output - - -class GLMMLP(nn.Module): - """MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. - """ - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - - self.add_bias = config.add_bias_linear - - # Project to 4h. - self.dense_h_to_4h = MergedColumnParallelLinear( - config.hidden_size, - [config.ffn_hidden_size] * 2, - bias=config.add_bias_linear, - linear_method=linear_method, - ) - - self.activation_func = SiluAndMul() - - # Project back to h. - self.dense_4h_to_h = RowParallelLinear( - config.ffn_hidden_size, - config.hidden_size, - bias=config.add_bias_linear, - linear_method=linear_method, - ) - - def forward(self, hidden_states): - # [s, b, 4hp] - intermediate_parallel, _ = self.dense_h_to_4h(hidden_states) - intermediate_parallel = self.activation_func(intermediate_parallel) - # [s, b, h] - output, _ = self.dense_4h_to_h(intermediate_parallel) - return output - - -class GLMBlock(nn.Module): - """A single transformer layer. - - Transformer layer takes input with size [s, b, h] and returns an - output of the same size. - """ - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.apply_residual_connection_post_layernorm = ( - config.apply_residual_connection_post_layernorm) - - self.fp32_residual_connection = config.fp32_residual_connection - - layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm - # Layernorm on the input data. - self.input_layernorm = layer_norm_func(config.hidden_size, - eps=config.layernorm_epsilon) - - # Self attention. 
- self.self_attention = GLMAttention(config, linear_method) - self.hidden_dropout = config.hidden_dropout - - # Layernorm on the attention output - self.post_attention_layernorm = layer_norm_func( - config.hidden_size, eps=config.layernorm_epsilon) - - # MLP - self.mlp = GLMMLP(config, linear_method) - - def forward( - self, - hidden_states: torch.Tensor, - position_ids: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - # hidden_states: [num_tokens, h] - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) - # Self attention. - attention_output = self.self_attention( - hidden_states=layernorm_output, - position_ids=position_ids, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = hidden_states - - layernorm_input = residual + attention_output - - # Layer norm post the self attention. - layernorm_output = self.post_attention_layernorm(layernorm_input) - - # Second residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = layernorm_input - - output = self.mlp(layernorm_output) + residual - - return output - - -class GLMTransformer(nn.Module): - """Transformer class.""" - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.post_layer_norm = config.post_layer_norm - - # Number of layers. - self.num_layers = config.num_layers - - # Transformer layers. - self.layers = nn.ModuleList( - [GLMBlock(config, linear_method) for i in range(self.num_layers)]) - - if self.post_layer_norm: - layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm - # Final layer norm before output. - self.final_layernorm = layer_norm_func( - config.hidden_size, eps=config.layernorm_epsilon) - - def forward( - self, - hidden_states: torch.Tensor, - position_ids: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - for i in range(self.num_layers): - layer = self.layers[i] - hidden_states = layer( - hidden_states=hidden_states, - position_ids=position_ids, - kv_cache=kv_caches[i], - input_metadata=input_metadata, - ) - # Final layer norm. - if self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states - - -class ChatGLMModel(nn.Module): - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - - self.embedding = VocabParallelEmbedding(config.padded_vocab_size, - config.hidden_size) - - self.num_layers = config.num_layers - self.multi_query_group_num = config.multi_query_group_num - self.kv_channels = config.kv_channels - self.encoder = GLMTransformer(config, linear_method) - - self.output_layer = ParallelLMHead(config.padded_vocab_size, - config.hidden_size) - - def forward( - self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - inputs_embeds = self.embedding(input_ids) - - # Run encoder. 
- hidden_states = self.encoder( - hidden_states=inputs_embeds, - position_ids=position_ids, - kv_caches=kv_caches, - input_metadata=input_metadata, - ) - return hidden_states - - -class ChatGLMForCausalLM(nn.Module): - - def __init__( - self, - config: ChatGLMConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config: ChatGLMConfig = config - self.linear_method = linear_method - self.transformer = ChatGLMModel(config, linear_method) - self.lm_head_weight = self.transformer.output_layer.weight - self.sampler = Sampler(config.padded_vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.transformer(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head_weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - params_dict = dict(self.named_parameters(remove_duplicate=False)) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_pos_emb.inv_freq" in name: - continue - if "word_embeddings" in name: - name = name.replace(".word_embeddings", "") - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py deleted file mode 100644 index 984be0cccd16dd3b857c603587f875b4738ddb0d..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/decilm.py +++ /dev/null @@ -1,123 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 DeciAI Research Team. All rights reserved. -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on MistralAI GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
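`GLMAttention` above, and `DeepseekAttention` and `FalconAttention` further down in this diff, all size their per-rank KV heads the same way: partition the KV heads when there are at least as many of them as tensor-parallel ranks, otherwise replicate each KV head across several ranks. A minimal sketch of just that arithmetic, with made-up head counts:

```python
def per_rank_kv_heads(total_num_kv_heads: int, tp_size: int) -> int:
    if total_num_kv_heads >= tp_size:
        # Partition: each rank owns a disjoint group of KV heads.
        assert total_num_kv_heads % tp_size == 0
    else:
        # Replicate: several ranks share a copy of the same KV head.
        assert tp_size % total_num_kv_heads == 0
    return max(1, total_num_kv_heads // tp_size)


assert per_rank_kv_heads(8, 2) == 4   # plain GQA, partitioned
assert per_rank_kv_heads(2, 2) == 1   # multi-query groups, one per rank
assert per_rank_kv_heads(1, 4) == 1   # multi-query attention, replicated
```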
-"""Inference-only DeciLM model compatible with HuggingFace weights.""" - -from typing import Optional - -import torch -from transformers import PretrainedConfig - -from vllm.model_executor.layers.linear import LinearMethodBase -from vllm.model_executor.models.llama import LlamaForCausalLM -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) - - -class DeciLMForCausalLM(LlamaForCausalLM): - """ - Implementation for https://huggingface.co/Deci/DeciLM-7b-instruct. - Based on the llama executor. - - The main difference is that DeciLM uses Variable Grouped Query Attention. - The constant number of GQA heads in the decoder is overriden with a value - per layer. - - Usually, in the HuggingFace implementation, instead of - "config.num_key_value_heads", we use - "config.num_key_value_heads_per_layer[i]" which varies. - - Currently, PagedAttention does not work well with variable GQA, so we - normalize the weights upon loading, and use uniform GQA with the max value - instead. - """ - - def __init__( - self, - config: Optional[PretrainedConfig] = None, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - config.num_key_value_heads = max(config.num_key_value_heads_per_layer) - delattr(config, "num_key_value_heads_per_layer") - super().__init__(config=config, linear_method=linear_method) - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - - if "k_proj" in name or "v_proj" in name: - loaded_weight = self._degroup_weight(loaded_weight) - - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - - def _degroup_weight(self, loaded_weight: torch.Tensor) -> torch.Tensor: - hidden_size = self.config.hidden_size - head_size = self.config.hidden_size // self.config.num_attention_heads - target_num_kv_heads = self.config.num_key_value_heads - num_kv_heads = loaded_weight.shape[0] // head_size - n_repeats = target_num_kv_heads / num_kv_heads - assert n_repeats == int(n_repeats) - - n_repeats = int(n_repeats) - loaded_weight = loaded_weight.view(num_kv_heads, head_size, - hidden_size) - loaded_weight = torch.repeat_interleave(loaded_weight, - repeats=n_repeats, - dim=0) - loaded_weight = loaded_weight.reshape(target_num_kv_heads * head_size, - hidden_size) - - return loaded_weight diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py deleted file mode 100644 index fc727b8e661b37bf1fa025452c611ea550582535..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/deepseek.py +++ /dev/null @@ -1,453 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
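The `_degroup_weight` helper that closes the DeciLM file above is what makes variable GQA fit the uniform-GQA attention kernel: a layer whose checkpoint has fewer KV heads gets its K/V projection weights repeated head-by-head until it matches the per-model maximum. A toy demonstration with made-up sizes (2 KV heads expanded to 4):

```python
import torch

# Illustrative sizes, not a real DeciLM configuration.
hidden_size = 16
head_size = 4
num_kv_heads = 2           # KV heads this layer's checkpoint actually has
target_num_kv_heads = 4    # per-model maximum used for every layer

loaded_weight = torch.randn(num_kv_heads * head_size, hidden_size)

n_repeats = target_num_kv_heads // num_kv_heads
# Repeat whole heads (not individual rows) along the output dimension.
degrouped = loaded_weight.view(num_kv_heads, head_size, hidden_size)
degrouped = torch.repeat_interleave(degrouped, repeats=n_repeats, dim=0)
degrouped = degrouped.reshape(target_num_kv_heads * head_size, hidden_size)

assert degrouped.shape == (target_num_kv_heads * head_size, hidden_size)
# The first two expanded heads are both copies of the original head 0.
assert torch.equal(degrouped[:head_size], degrouped[head_size:2 * head_size])
```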
-"""Inference-only Deepseek model.""" -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn -import torch.nn.functional as F -from transformers import PretrainedConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - ReplicatedLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_reduce) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class DeepseekMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - reduce_results: bool = True, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method, - reduce_results=reduce_results) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class DeepseekMoE(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.rank = get_tensor_model_parallel_rank() - self.tp_size = get_tensor_model_parallel_world_size() - self.n_routed_experts = config.n_routed_experts - self.top_k = config.num_experts_per_tok - if self.tp_size > self.n_routed_experts: - raise ValueError( - f"Tensor parallel size {self.tp_size} is greater than " - f"the number of experts {self.n_routed_experts}.") - - self.experts = nn.ModuleList([ - DeepseekMLP(hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - reduce_results=False) - for idx in range(self.n_routed_experts) - ]) - self.pack_params() - - self.gate = ReplicatedLinear(config.hidden_size, - self.n_routed_experts, - bias=False, - linear_method=None) - - if config.n_shared_experts is not None: - intermediate_size = config.moe_intermediate_size * config.n_shared_experts - self.shared_experts = DeepseekMLP( - hidden_size=config.hidden_size, - intermediate_size=intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - reduce_results=False, - ) - - def pack_params(self): - w1 = [] - w2 = [] - for expert in self.experts: - w1.append(expert.gate_up_proj.weight) - w2.append(expert.down_proj.weight) - self.w1 = torch._utils._flatten_dense_tensors(w1) - w1s = torch._utils._unflatten_dense_tensors(self.w1, w1) - for data, param in zip(w1s, w1): - param.data = data - self.w1 = self.w1.view(len(w1), *w1s[0].shape) - - self.w2 = torch._utils._flatten_dense_tensors(w2) - w2s = torch._utils._unflatten_dense_tensors(self.w2, w2) - for data, param in zip(w2s, w2): - param.data = data - - self.w2 = self.w2.view(len(w2), *w2s[0].shape) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - batch_size, sequence_length, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - if self.config.n_shared_experts is not None: - shared_output = self.shared_experts(hidden_states) - # router_logits: (batch * sequence_length, n_experts) - router_logits, _ = self.gate(hidden_states) - - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, - self.top_k, - dim=-1) - - if self.config.norm_topk_prob: - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - - final_hidden_states = fused_moe(hidden_states, - self.w1, - self.w2, - routing_weights, - selected_experts, - inplace=True) - - if self.config.n_shared_experts is not None: - final_hidden_states = final_hidden_states + shared_output - final_hidden_states = tensor_model_parallel_all_reduce( - final_hidden_states) - - return final_hidden_states.view(batch_size, sequence_length, - hidden_dim) - - -class DeepseekAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = 
num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. - assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class DeepseekDecoderLayer(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - layer_idx: int, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = DeepseekAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - ) - if (config.n_routed_experts is not None and \ - layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): - self.mlp = DeepseekMoE(config=config, linear_method=linear_method) - else: - self.mlp = DeepseekMLP( - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> torch.Tensor: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, 
residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class DeepseekModel(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - DeepseekDecoderLayer(config, - layer_idx, - linear_method=linear_method) - for layer_idx in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer(positions, hidden_states, - kv_caches[i], input_metadata, - residual) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class DeepseekForCausalLM(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = DeepseekModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, - cache_dir, - load_format, - revision, - fall_back_to_pt=False): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Skip experts that are not assigned to this worker. - if (("mlp.experts." in name or "mlp.shared_experts." 
in name) - and name not in params_dict): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Skip experts that are not assigned to this worker. - if (("mlp.experts." in name or "mlp.shared_experts." in name) - and name not in params_dict): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py deleted file mode 100644 index 2b5e022312e3bd08d22f4e8ebbede20756deed7a..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/falcon.py +++ /dev/null @@ -1,447 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py -# Copyright 2023 The vLLM team. -# Copyright 2023 the Falcon authors and HuggingFace Inc. team. All rights -# reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch Falcon model.""" - -import math -from typing import List, Optional, Tuple, Union - -import torch -from torch import nn -from torch.nn import LayerNorm -from transformers import FalconConfig as HF_FalconConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearMethodBase, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_reduce) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs import RWConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] -FalconConfig = Union[HF_FalconConfig, RWConfig] - - -def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: - closest_power_of_2 = 2**math.floor(math.log2(total_num_heads)) - base = torch.tensor(2**(-(2**-(math.log2(closest_power_of_2) - 3))), - dtype=torch.float32) - powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32) - slopes = torch.pow(base, powers) - - if closest_power_of_2 != total_num_heads: - extra_base = torch.tensor( - 2**(-(2**-(math.log2(2 * closest_power_of_2) - 
3))), - dtype=torch.float32) - num_remaining_heads = min(closest_power_of_2, - total_num_heads - closest_power_of_2) - extra_powers = torch.arange(1, - 1 + 2 * num_remaining_heads, - 2, - dtype=torch.int32) - slopes = torch.cat( - [slopes, torch.pow(extra_base, extra_powers)], dim=0) - - return slopes - - -class FalconAttention(nn.Module): - - def __init__( - self, - config: FalconConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - - self.hidden_size = config.hidden_size - tp_size = get_tensor_model_parallel_world_size() - - self.total_num_heads = config.num_attention_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.head_dim = self.hidden_size // self.total_num_heads - assert self.head_dim * self.total_num_heads == self.hidden_size - - self.new_decoder_architecture = config.new_decoder_architecture - self.multi_query = config.multi_query - - if self.new_decoder_architecture: - self.total_num_kv_heads = config.num_kv_heads - elif self.multi_query: - self.total_num_kv_heads = 1 - else: - self.total_num_kv_heads = self.total_num_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. - assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - - self.query_key_value = QKVParallelLinear( - self.hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=config.bias, - skip_bias_add=True, - linear_method=linear_method, - ) - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - - # Layer-wise attention scaling - self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) - self.reduce_row_parallel_results = not (config.new_decoder_architecture - or config.parallel_attn) - self.dense = RowParallelLinear( - self.hidden_size, - self.hidden_size, - bias=config.bias, - skip_bias_add=True, - linear_method=linear_method, - reduce_results=self.reduce_row_parallel_results) - - self.use_rotary = config.rotary - self.use_alibi = config.alibi - assert not (self.use_rotary and self.use_alibi), ( - "Rotary and alibi are mutually exclusive.") - - if self.use_rotary: - rope_theta = getattr(config, "rope_theta", 10000) - max_position_embeddings = getattr(config, - "max_position_embeddings", 8192) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.inv_norm_factor, - num_kv_heads=self.num_kv_heads) - elif self.use_alibi: - tp_rank = get_tensor_model_parallel_rank() - head_start = tp_rank * self.num_heads - head_end = (tp_rank + 1) * self.num_heads - alibi_slopes = (_get_alibi_slopes(self.total_num_heads) * - self.inv_norm_factor) - alibi_slopes = alibi_slopes[head_start:head_end].tolist() - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.inv_norm_factor, - num_kv_heads=self.num_kv_heads, - alibi_slopes=alibi_slopes) - else: - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.inv_norm_factor, - num_kv_heads=self.num_kv_heads) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - 
input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, bias = self.query_key_value(hidden_states) - if bias is not None: - qkv += bias - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - if self.use_rotary: - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - attn_output, bias = self.dense(attn_output) - return attn_output, bias - - -class FalconMLP(nn.Module): - - def __init__( - self, - config: FalconConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - hidden_size = config.hidden_size - - self.dense_h_to_4h = ColumnParallelLinear(hidden_size, - 4 * hidden_size, - bias=config.bias, - skip_bias_add=True, - linear_method=linear_method) - quant_config = getattr(linear_method, "quant_config", None) - self.act = get_act_fn("gelu", quant_config, 4 * hidden_size) - self.reduce_row_parallel_results = not (config.new_decoder_architecture - or config.parallel_attn) - self.dense_4h_to_h = RowParallelLinear( - 4 * hidden_size, - hidden_size, - bias=config.bias, - skip_bias_add=True, - reduce_results=self.reduce_row_parallel_results, - linear_method=linear_method) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - # NOTE(zhuohan): Following huggingface, we do not fuse bias add here. - x, bias = self.dense_h_to_4h(x) - if bias is not None: - x += bias - x = self.act(x) - x, bias = self.dense_4h_to_h(x) - return x, bias - - -class FalconDecoderLayer(nn.Module): - - def __init__( - self, - config: FalconConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.self_attention = FalconAttention(config, linear_method) - self.mlp = FalconMLP(config, linear_method) - self.config = config - - if config.new_decoder_architecture: - # The layer norm before self-attention - self.ln_attn = LayerNorm(hidden_size, - eps=config.layer_norm_epsilon) - # The layer norm before the MLP - self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - else: - self.input_layernorm = LayerNorm(hidden_size, - eps=config.layer_norm_epsilon) - if not config.parallel_attn: - self.post_attention_layernorm = LayerNorm( - hidden_size, eps=config.layer_norm_epsilon) - - self.reduce_row_parallel_results = not (config.new_decoder_architecture - or config.parallel_attn) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - residual = hidden_states - - if self.config.new_decoder_architecture: - attention_layernorm_out = self.ln_attn(hidden_states) - mlp_layernorm_out = self.ln_mlp(hidden_states) - else: - attention_layernorm_out = self.input_layernorm(hidden_states) - - # Self attention. - attention_output, attention_bias = self.self_attention( - positions=positions, - hidden_states=attention_layernorm_out, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - if self.reduce_row_parallel_results and attention_bias is not None: - attention_output += attention_bias - - if not self.config.new_decoder_architecture: - if self.config.parallel_attn: - mlp_layernorm_out = attention_layernorm_out - else: - residual += attention_output - mlp_layernorm_out = self.post_attention_layernorm(residual) - - # MLP. 
- mlp_output, mlp_bias = self.mlp(mlp_layernorm_out) - if self.reduce_row_parallel_results and mlp_bias is not None: - mlp_output += mlp_bias - - if not self.reduce_row_parallel_results: - # When MLP and Attention layers are parallel, we can use - # only one all-reduce operator to reduce the results from - # both MLP and Attention layers. - mlp_output += attention_output - mlp_output = tensor_model_parallel_all_reduce(mlp_output) - if attention_bias is not None: - mlp_output += attention_bias - if mlp_bias is not None: - mlp_output += mlp_bias - - output = mlp_output + residual - return output - - -class FalconModel(nn.Module): - - def __init__( - self, - config: FalconConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.use_alibi = config.alibi - - # Embedding + LN Embedding - self.word_embeddings = VocabParallelEmbedding( - config.vocab_size, - self.embed_dim, - ) - - # Transformer blocks - self.h = nn.ModuleList([ - FalconDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - - # Final Layer Norm - self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - - def forward( - self, - input_ids: torch.LongTensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.word_embeddings(input_ids) - for i in range(len(self.h)): - layer = self.h[i] - hidden_states = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - ) - hidden_states = self.ln_f(hidden_states) - return hidden_states - - -class FalconForCausalLM(nn.Module): - - def __init__( - self, - config: FalconConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.transformer = FalconModel(config, linear_method) - self.lm_head = ParallelLMHead( - config.vocab_size, - config.hidden_size, - ) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.LongTensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.transformer( - input_ids, - positions, - kv_caches, - input_metadata, - ) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - total_num_heads = self.config.num_attention_heads - if self.config.new_decoder_architecture: - total_num_kv_heads = self.config.num_kv_heads - elif self.config.multi_query: - total_num_kv_heads = 1 - else: - total_num_kv_heads = total_num_heads - num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - if "query_key_value" in name: - output_dim = getattr(param, "output_dim", None) - loaded_weight_shape = loaded_weight.shape - if output_dim is not None: - loaded_weight = loaded_weight.view( - loaded_weight_shape[:output_dim] + - (total_num_kv_heads, num_query_heads_per_kv_head + 2, - -1) + loaded_weight_shape[output_dim + 1:]) - wq = loaded_weight.narrow( - output_dim + 1, 0, - num_query_heads_per_kv_head).reshape( - *loaded_weight_shape[:output_dim], -1, - *loaded_weight_shape[output_dim + 1:]) - wk = loaded_weight.narrow( - output_dim + 1, num_query_heads_per_kv_head, - 1).reshape(*loaded_weight_shape[:output_dim], -1, - *loaded_weight_shape[output_dim + 1:]) - wv = loaded_weight.narrow( - output_dim + 1, num_query_heads_per_kv_head + 1, - 1).reshape(*loaded_weight_shape[:output_dim], -1, - *loaded_weight_shape[output_dim + 1:]) - loaded_weight = torch.cat([wq, wk, wv], dim=output_dim) - - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py deleted file mode 100644 index 661da0fe0434ecc977d84f8f8530eda8b1481db6..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/gpt2.py +++ /dev/null @@ -1,273 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py -# Copyright 2023 The vLLM team. -# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Inference-only GPT-2 model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import GPT2Config - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearMethodBase, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class GPT2Attention(nn.Module): - - def __init__( - self, - config: GPT2Config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = config.hidden_size - total_num_heads = config.num_attention_heads - tensor_model_parallel_world_size = ( - get_tensor_model_parallel_world_size()) - assert total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = total_num_heads // tensor_model_parallel_world_size - self.head_dim = self.hidden_size // total_num_heads - self.scale = self.head_dim**-0.5 - - self.c_attn = QKVParallelLinear( - self.hidden_size, - self.head_dim, - total_num_heads, - bias=True, - linear_method=linear_method, - ) - self.c_proj = RowParallelLinear( - self.hidden_size, - self.hidden_size, - bias=True, - linear_method=linear_method, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scale) - - def forward( - self, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.c_attn(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - key_cache, value_cache = kv_cache - attn_output = self.attn(q, k, v, key_cache, value_cache, - input_metadata) - attn_output, _ = self.c_proj(attn_output) - return attn_output - - -class GPT2MLP(nn.Module): - - def __init__( - self, - intermediate_size: int, - config: GPT2Config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - hidden_size = config.hidden_size - self.c_fc = ColumnParallelLinear( - hidden_size, - intermediate_size, - bias=True, - linear_method=linear_method, - ) - self.c_proj = RowParallelLinear( - intermediate_size, - hidden_size, - bias=True, - linear_method=linear_method, - ) - quant_config = getattr(linear_method, "quant_config", None) - self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states, _ = self.c_fc(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states, _ = self.c_proj(hidden_states) - return hidden_states - - -class GPT2Block(nn.Module): - - def __init__( - self, - config: GPT2Config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - hidden_size = config.hidden_size - inner_dim = (config.n_inner if config.n_inner is not None else 4 * - hidden_size) - - self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn = GPT2Attention(config, linear_method) - 
self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.mlp = GPT2MLP(inner_dim, config, linear_method) - - def forward( - self, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - residual = hidden_states - hidden_states = self.ln_1(hidden_states) - attn_output = self.attn( - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - # residual connection - hidden_states = attn_output + residual - - residual = hidden_states - hidden_states = self.ln_2(hidden_states) - feed_forward_hidden_states = self.mlp(hidden_states) - # residual connection - hidden_states = residual + feed_forward_hidden_states - return hidden_states - - -class GPT2Model(nn.Module): - - def __init__( - self, - config: GPT2Config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - assert not config.add_cross_attention - assert not config.scale_attn_by_inverse_layer_idx - assert not config.reorder_and_upcast_attn - self.embed_dim = config.hidden_size - self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim) - self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) - self.h = nn.ModuleList([ - GPT2Block(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - - def forward( - self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - inputs_embeds = self.wte(input_ids) - position_embeds = self.wpe(position_ids) - hidden_states = inputs_embeds + position_embeds - - for i in range(len(self.h)): - layer = self.h[i] - hidden_states = layer(hidden_states, kv_caches[i], input_metadata) - - hidden_states = self.ln_f(hidden_states) - return hidden_states - - -class GPT2LMHeadModel(nn.Module): - - def __init__( - self, - config: GPT2Config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.transformer = GPT2Model(config, linear_method) - self.lm_head_weight = self.transformer.wte.weight - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.transformer(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head_weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - params_dict = dict(self.named_parameters(remove_duplicate=False)) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "lm_head.weight" in name: - # GPT-2 ties the weights of the embedding layer and the final - # linear layer. - continue - if ".attn.bias" in name or ".attn.masked_bias" in name: - # Skip attention mask. - # NOTE: "c_attn.bias" should not be skipped. - continue - if not name.startswith("transformer."): - name = "transformer." + name - param = params_dict[name] - # The HF's GPT-2 implementation uses Conv1D instead of Linear. 
- # Because of this, we need to transpose the weights. - # Note(zhuohan): the logic below might break quantized models. - for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]: - if conv1d_weight_name not in name: - continue - if not name.endswith(".weight"): - continue - loaded_weight = loaded_weight.t() - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py deleted file mode 100644 index ef4c1d4143c88d10dbea323d0e074253955b43bb..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/gpt_bigcode.py +++ /dev/null @@ -1,279 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py -# Copyright 2023 The vLLM team. -# Copyright 2023 CTranslate2, and Michael Feil -# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only GPTBigCode model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import GPTBigCodeConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearMethodBase, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class GPTBigCodeAttention(nn.Module): - - def __init__( - self, - config: GPTBigCodeConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = config.hidden_size - total_num_heads = config.num_attention_heads - self.tensor_model_parallel_world_size = ( - get_tensor_model_parallel_world_size()) - assert total_num_heads % self.tensor_model_parallel_world_size == 0 - self.num_heads = (total_num_heads // - self.tensor_model_parallel_world_size) - self.head_dim = self.hidden_size // total_num_heads - self.scale = self.head_dim**-0.5 - - self.multi_query = config.multi_query - if self.multi_query: - total_num_kv_heads = 1 - self.num_kv_heads = 1 - else: - total_num_kv_heads = total_num_heads - self.num_kv_heads = self.num_heads - self.kv_dim = self.head_dim * self.num_kv_heads - self.c_attn = QKVParallelLinear( - self.hidden_size, - 
self.head_dim, - total_num_heads, - total_num_kv_heads, - bias=True, - linear_method=linear_method, - ) - - self.c_proj = RowParallelLinear( - self.hidden_size, - self.hidden_size, - bias=True, - linear_method=linear_method, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scale, - num_kv_heads=self.num_kv_heads) - - def forward( - self, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.c_attn(hidden_states) - q, k, v = qkv.split( - [ - self.hidden_size // self.tensor_model_parallel_world_size, - self.kv_dim, self.kv_dim - ], - dim=-1, - ) - key_cache, value_cache = kv_cache - attn_output = self.attn(q, k, v, key_cache, value_cache, - input_metadata) - attn_output, _ = self.c_proj(attn_output) - return attn_output - - -class GPTBigMLP(nn.Module): - - def __init__( - self, - intermediate_size: int, - config: GPTBigCodeConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - hidden_size = config.hidden_size - self.c_fc = ColumnParallelLinear( - hidden_size, - intermediate_size, - bias=True, - linear_method=linear_method, - ) - self.c_proj = RowParallelLinear( - intermediate_size, - hidden_size, - bias=True, - linear_method=linear_method, - ) - quant_config = getattr(linear_method, "quant_config", None) - self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states, _ = self.c_fc(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states, _ = self.c_proj(hidden_states) - return hidden_states - - -class GPTBigCodeBlock(nn.Module): - - def __init__( - self, - config: GPTBigCodeConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - hidden_size = config.hidden_size - inner_dim = (config.n_inner if config.n_inner is not None else 4 * - hidden_size) - - self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn = GPTBigCodeAttention(config, linear_method) - self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.mlp = GPTBigMLP(inner_dim, config, linear_method) - - def forward( - self, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - residual = hidden_states - hidden_states = self.ln_1(hidden_states) - attn_output = self.attn( - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - # residual connection - hidden_states = attn_output + residual - - residual = hidden_states - hidden_states = self.ln_2(hidden_states) - feed_forward_hidden_states = self.mlp(hidden_states) - # residual connection - hidden_states = residual + feed_forward_hidden_states - return hidden_states - - -class GPTBigCodeModel(nn.Module): - - def __init__( - self, - config: GPTBigCodeConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - assert not config.add_cross_attention - - self.embed_dim = config.hidden_size - - self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim) - self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) - self.h = nn.ModuleList([ - GPTBigCodeBlock(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - - def forward( - self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - kv_caches: List[KVCache], - 
input_metadata: InputMetadata, - ) -> torch.Tensor: - inputs_embeds = self.wte(input_ids) - position_embeds = self.wpe(position_ids) - hidden_states = inputs_embeds + position_embeds - - for i in range(len(self.h)): - layer = self.h[i] - hidden_states = layer(hidden_states, kv_caches[i], input_metadata) - - hidden_states = self.ln_f(hidden_states) - return hidden_states - - -class GPTBigCodeForCausalLM(nn.Module): - - def __init__( - self, - config: GPTBigCodeConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.transformer = GPTBigCodeModel(config, linear_method) - self.lm_head_weight = self.transformer.wte.weight - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.transformer(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head_weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - params_dict = dict(self.named_parameters(remove_duplicate=False)) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "lm_head.weight" in name: - continue - if ".attn.bias" in name: - # Skip attention mask. - # NOTE: "c_attn.bias" should not be skipped. - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py deleted file mode 100644 index 5bab30d9d442ee263bdd9b214fbb186fa88e3ef0..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/gpt_j.py +++ /dev/null @@ -1,284 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py -# Copyright 2023 The vLLM team. -# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Inference-only GPT-J model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import GPTJConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearMethodBase, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class GPTJAttention(nn.Module): - - def __init__( - self, - config: GPTJConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.total_num_heads = config.num_attention_heads - self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.total_num_heads - - self.qkv_proj = QKVParallelLinear( - config.hidden_size, - self.head_size, - self.total_num_heads, - bias=False, - linear_method=linear_method, - ) - self.out_proj = RowParallelLinear( - config.hidden_size, - config.hidden_size, - bias=False, - linear_method=linear_method, - ) - - tp_world_size = get_tensor_model_parallel_world_size() - assert self.total_num_heads % tp_world_size == 0 - self.num_heads = self.total_num_heads // tp_world_size - - scaling = self.head_size**-0.5 - assert getattr(config, "rotary", True) - assert config.rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.rotary_emb = get_rope( - self.head_size, - rotary_dim=config.rotary_dim, - max_position=max_position_embeddings, - base=rope_theta, - is_neox_style=False, - ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) - - def forward( - self, - position_ids: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - attn_output, _ = self.out_proj(attn_output) - return attn_output - - -class GPTJMLP(nn.Module): - - def __init__( - self, - intermediate_size: int, - config: GPTJConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - hidden_size = config.n_embd - self.fc_in = ColumnParallelLinear( - hidden_size, - intermediate_size, - linear_method=linear_method, - ) - self.fc_out = RowParallelLinear( - intermediate_size, - hidden_size, - linear_method=linear_method, - ) - quant_config = getattr(linear_method, "quant_config", None) - self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states, _ = self.fc_in(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states, _ = 
self.fc_out(hidden_states) - return hidden_states - - -class GPTJBlock(nn.Module): - - def __init__( - self, - config: GPTJConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner - self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) - self.attn = GPTJAttention(config, linear_method) - self.mlp = GPTJMLP(inner_dim, config, linear_method) - - def forward( - self, - position_ids: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - residual = hidden_states - hidden_states = self.ln_1(hidden_states) - attn_output = self.attn( - position_ids=position_ids, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - mlp_output = self.mlp(hidden_states) - hidden_states = attn_output + mlp_output + residual - return hidden_states - - -class GPTJModel(nn.Module): - - def __init__( - self, - config: GPTJConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.embed_dim = config.n_embd - self.wte = VocabParallelEmbedding( - config.vocab_size, - self.embed_dim, - ) - self.h = nn.ModuleList( - [GPTJBlock(config, linear_method) for _ in range(config.n_layer)]) - self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - - def forward( - self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.wte(input_ids) - for i in range(len(self.h)): - layer = self.h[i] - hidden_states = layer( - position_ids, - hidden_states, - kv_caches[i], - input_metadata, - ) - hidden_states = self.ln_f(hidden_states) - return hidden_states - - -class GPTJForCausalLM(nn.Module): - - def __init__( - self, - config: GPTJConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - assert not config.tie_word_embeddings - self.transformer = GPTJModel(config, linear_method) - self.lm_head = ParallelLMHead( - config.vocab_size, - config.n_embd, - bias=True, - ) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.transformer(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata, self.lm_head.bias) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "attn.bias" in name or "attn.masked_bias" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = 
name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py deleted file mode 100644 index 8f7e1063e0c1dfed5c81ae24f042a9e31e36b713..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/gpt_neox.py +++ /dev/null @@ -1,294 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only GPT-NeoX model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import GPTNeoXConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearMethodBase, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class GPTNeoXAttention(nn.Module): - - def __init__( - self, - config: GPTNeoXConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.total_num_heads = config.num_attention_heads - self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.total_num_heads - self.bias = getattr(config, "attention_bias", True) - - tensor_model_parallel_world_size = ( - get_tensor_model_parallel_world_size()) - assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = (self.total_num_heads // - tensor_model_parallel_world_size) - - self.query_key_value = QKVParallelLinear( - config.hidden_size, - self.head_size, - self.total_num_heads, - bias=self.bias, - linear_method=linear_method, - ) - self.dense = RowParallelLinear( - 
config.hidden_size, - config.hidden_size, - bias=self.bias, - linear_method=linear_method, - ) - scaling = self.head_size**-0.5 - rotary_dim = int(self.head_size * config.rotary_pct) - assert rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.rotary_emb = get_rope( - self.head_size, - rotary_dim=rotary_dim, - max_position=max_position_embeddings, - base=rope_theta, - ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) - - def forward( - self, - position_ids: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.query_key_value(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.dense(attn_output) - return output - - -class GPTNeoXMLP(nn.Module): - - def __init__( - self, - config: GPTNeoXConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.dense_h_to_4h = ColumnParallelLinear( - config.hidden_size, - config.intermediate_size, - linear_method=linear_method, - ) - self.dense_4h_to_h = RowParallelLinear( - config.intermediate_size, - config.hidden_size, - linear_method=linear_method, - ) - quant_config = getattr(linear_method, "quant_config", None) - self.act = get_act_fn(config.hidden_act, quant_config, - config.intermediate_size) - - def forward(self, hidden_states): - hidden_states, _ = self.dense_h_to_4h(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states, _ = self.dense_4h_to_h(hidden_states) - return hidden_states - - -class GPTNeoXLayer(nn.Module): - - def __init__( - self, - config: GPTNeoXConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.use_parallel_residual = config.use_parallel_residual - self.input_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - self.attention = GPTNeoXAttention(config, linear_method) - self.mlp = GPTNeoXMLP(config, linear_method) - - def forward( - self, - position_ids: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - attn_input = self.input_layernorm(hidden_states) - attn_output = self.attention( - position_ids=position_ids, - hidden_states=attn_input, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - if self.use_parallel_residual: - # pseudocode: - # x = x + attn(ln1(x)) + mlp(ln2(x)) - mlp_input = self.post_attention_layernorm(hidden_states) - mlp_output = self.mlp(mlp_input) - hidden_states = mlp_output + attn_output + hidden_states - else: - # pseudocode: - # x = x + attn(ln1(x)) - # x = x + mlp(ln2(x)) - attn_output = attn_output + hidden_states - mlp_input = self.post_attention_layernorm(attn_output) - mlp_output = self.mlp(mlp_input) - hidden_states = mlp_output + attn_output - return hidden_states - - -class GPTNeoXModel(nn.Module): - - def __init__( - self, - config: GPTNeoXConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - - self.embed_in = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - GPTNeoXLayer(config, linear_method) - for _ in 
range(config.num_hidden_layers) - ]) - self.final_layer_norm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_in(input_ids) - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states = layer( - position_ids, - hidden_states, - kv_caches[i], - input_metadata, - ) - hidden_states = self.final_layer_norm(hidden_states) - return hidden_states - - -class GPTNeoXForCausalLM(nn.Module): - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.gpt_neox = GPTNeoXModel(config, linear_method) - self.embed_out = ParallelLMHead( - config.vocab_size, - config.hidden_size, - ) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.gpt_neox(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.embed_out.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if ("attention.bias" in name or "attention.masked_bias" in name - or "rotary_emb.inv_freq" in name): - continue - param = params_dict[name] - - if "query_key_value" in name: - # NOTE: GPT-NeoX's fused QKV's output_dim has the shape of - # (num_heads * 3 * head_size), while the - # required shape is (3 * num_heads * head_size). - # Thus, we need weight conversion. 
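The NOTE that ends the chunk above is the key subtlety in the GPT-NeoX loader: the HF checkpoint stores the fused QKV weight with Q, K and V interleaved per head, i.e. `(num_heads, 3, head_size)` along the output axis, while vLLM's `QKVParallelLinear` expects all Q heads first, then all K heads, then all V heads, i.e. `(3, num_heads, head_size)`. A minimal standalone sketch of that view/transpose/reshape on a dummy weight (the toy sizes below are made up for illustration and are not from the source):

```python
import torch

# Hypothetical toy sizes, chosen only for illustration.
num_heads, head_size, hidden = 4, 8, 32

# Fused QKV weight as stored in the checkpoint: (num_heads * 3 * head_size, hidden),
# laid out as [q_0, k_0, v_0, q_1, k_1, v_1, ...] along the output axis.
w = torch.randn(num_heads * 3 * head_size, hidden)

# Required layout: [q_0..q_{H-1}, k_0..k_{H-1}, v_0..v_{H-1}] along the same axis.
w_fixed = (w.view(num_heads, 3, head_size, hidden)
            .transpose(0, 1)        # -> (3, num_heads, head_size, hidden)
            .reshape(w.shape))      # flatten back to the original 2-D shape

# Head 1's query slice in the interleaved layout must equal head 1's slice
# inside the query block of the permuted layout.
interleaved = w.view(num_heads, 3, head_size, hidden)
permuted = w_fixed.view(3, num_heads, head_size, hidden)
assert torch.equal(interleaved[1, 0], permuted[0, 1])
```

The loader code that follows performs the same permutation generically along the parameter's `output_dim`, which is why its `view` call splices `(num_heads, 3, -1)` into `loaded_weight_shape` around that axis before transposing and reshaping back.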
- output_dim = getattr(param, "output_dim", None) - num_heads = self.config.num_attention_heads - if output_dim is not None: - loaded_weight_shape = loaded_weight.shape - loaded_weight = loaded_weight.view( - loaded_weight_shape[:output_dim] + (num_heads, 3, -1) + - loaded_weight_shape[output_dim + 1:]) - loaded_weight = loaded_weight.transpose( - output_dim, output_dim + 1) - loaded_weight = loaded_weight.reshape(loaded_weight_shape) - - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py deleted file mode 100644 index 5d0b93793c89d5aefcf15f3d90d2e274f12db116..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/internlm.py +++ /dev/null @@ -1,299 +0,0 @@ -# -*- coding: utf-8 -*- -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn -from transformers import LlamaConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class InternLMMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class InternLMAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - bias: bool, - rope_theta: float = 10000, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - rope_scaling: Optional[Dict[str, Any]] = None, - ): - super().__init__() - self.hidden_size = hidden_size - tensor_model_parallel_world_size = ( - get_tensor_model_parallel_world_size()) - self.total_num_heads = num_heads - assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = (self.total_num_heads // - tensor_model_parallel_world_size) - self.head_dim = hidden_size // self.total_num_heads - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - bias=bias, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=bias, - linear_method=linear_method, - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class InternLMDecoderLayer(nn.Module): - - def __init__( - self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = InternLMAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - bias=config.bias, - rope_theta=rope_theta, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - rope_scaling=getattr(config, "rope_scaling", None), - ) - self.mlp = InternLMMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = 
self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class InternLMModel(nn.Module): - - def __init__( - self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - vocab_size = ((config.vocab_size + 63) // 64) * 64 - self.embed_tokens = VocabParallelEmbedding( - vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - InternLMDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class InternLMForCausalLM(nn.Module): - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = InternLMModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py deleted file mode 100644 index e5a1abebf142025de4820e6a94c118d1e9ce9fcd..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/llama.py +++ /dev/null @@ -1,359 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn -from transformers import LlamaConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.config import LoRAConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class LlamaMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class LlamaAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. - assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class LlamaDecoderLayer(nn.Module): - - def __init__( - self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = LlamaAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - ) - self.mlp = LlamaMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = 
RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class LlamaModel(nn.Module): - - def __init__( - self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - ) - self.layers = nn.ModuleList([ - LlamaDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class LlamaForCausalLM(nn.Module): - supports_lora = True - - def __init__( - self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = LlamaModel(config, linear_method, lora_config=lora_config) - unpadded_vocab_size = config.vocab_size - if lora_config: - unpadded_vocab_size += lora_config.lora_extra_vocab_size - self.lm_head = ParallelLMHead( - unpadded_vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config else lora_config.lora_vocab_padding_size, - ) - self.sampler = Sampler(unpadded_vocab_size, config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = 
"auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py deleted file mode 100644 index 01cde678441222924ad513c21360fcfa0f9db1b1..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/mistral.py +++ /dev/null @@ -1,352 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Inference-only Mistral model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import MistralConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.config import LoRAConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class MistralMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class MistralAttention(nn.Module): - - def __init__(self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - max_position: int = 4096 * 32, - rope_theta: float = 10000, - linear_method: Optional[LinearMethodBase] = None, - sliding_window: Optional[int] = None) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.sliding_window = sliding_window - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position, - base=self.rope_theta, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class MistralDecoderLayer(nn.Module): - - def __init__( - self, - config: MistralConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - self.self_attn = MistralAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - max_position=config.max_position_embeddings, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - linear_method=linear_method, - sliding_window=config.sliding_window) - self.mlp = MistralMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class MistralModel(nn.Module): - - def __init__( - self, - config: MistralConfig, - linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + 
lora_vocab - self.org_vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - ) - self.layers = nn.ModuleList([ - MistralDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class MistralForCausalLM(nn.Module): - supports_lora = True - - def __init__( - self, - config: MistralConfig, - linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = MistralModel(config, - linear_method, - lora_config=lora_config) - unpadded_vocab_size = config.vocab_size - if lora_config: - unpadded_vocab_size += lora_config.lora_extra_vocab_size - self.lm_head = ParallelLMHead( - unpadded_vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config else lora_config.lora_vocab_padding_size, - ) - self.sampler = Sampler(unpadded_vocab_size, config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py deleted file mode 100644 index f36c35fd27ad52716a8f6cc1ca8670c77c9f1819..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/mixtral.py +++ /dev/null @@ -1,420 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only Mixtral model.""" -from typing import List, Optional, Tuple - -import torch -import torch.nn.functional as F - -from torch import nn -from transformers import MixtralConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - QKVParallelLinear, - ReplicatedLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_reduce) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_weight_attrs -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class MixtralMoE(nn.Module): - """A tensor-parallel MoE implementation for Mixtral that shards each expert - across all ranks. - - Each expert's weights are sharded across all ranks and a fused MoE - kernel is used for the forward pass, and finally we reduce the outputs - across ranks. 
- """ - - def __init__( - self, - num_experts: int, - top_k: int, - hidden_size: int, - intermediate_size: int, - params_dtype: Optional[torch.dtype] = None, - ): - super().__init__() - tp_size = get_tensor_model_parallel_world_size() - self.num_total_experts = num_experts - self.top_k = top_k - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size // tp_size - - if params_dtype is None: - params_dtype = torch.get_default_dtype() - self.params_dtype = params_dtype - - self.gate = ReplicatedLinear(self.hidden_size, - self.num_total_experts, - bias=False, - params_dtype=self.params_dtype, - linear_method=None) - - self.ws = nn.Parameter( - torch.empty(self.num_total_experts, - 2 * self.intermediate_size, - self.hidden_size, - device="cuda", - dtype=self.params_dtype)) - self.w2s = nn.Parameter( - torch.empty(self.num_total_experts, - self.hidden_size, - self.intermediate_size, - device="cuda", - dtype=self.params_dtype)) - - set_weight_attrs(self.ws, { - "weight_loader": self.weight_loader, - }) - set_weight_attrs(self.w2s, { - "weight_loader": self.weight_loader, - }) - - def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, - weight_name: str, expert_id: int): - tp_rank = get_tensor_model_parallel_rank() - param_data = param.data - shard_size = self.intermediate_size - shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) - if weight_name.endswith("w1.weight"): - param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :] - if weight_name.endswith("w3.weight"): - param_data[expert_id, - shard_size:2 * shard_size, :] = loaded_weight[shard, :] - if weight_name.endswith("w2.weight"): - param_data[expert_id, :, :] = loaded_weight[:, shard] - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - batch_size, sequence_length, hidden_size = hidden_states.shape - hidden_states = hidden_states.view(-1, self.hidden_size) - # router_logits: (batch * sequence_length, n_experts) - router_logits, _ = self.gate(hidden_states) - - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - routing_weights, selected_experts = torch.topk(routing_weights, - self.top_k, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - - final_hidden_states = fused_moe(hidden_states, - self.ws, - self.w2s, - routing_weights, - selected_experts, - inplace=True) - - final_hidden_states = tensor_model_parallel_all_reduce( - final_hidden_states) - - return final_hidden_states.view(batch_size, sequence_length, - hidden_size) - - -class MixtralAttention(nn.Module): - - def __init__(self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - max_position: int = 4096 * 32, - rope_theta: float = 10000, - linear_method: Optional[LinearMethodBase] = None, - sliding_window: Optional[int] = None) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.sliding_window = sliding_window - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position, - base=int(self.rope_theta), - is_neox_style=True, - ) - self.attn = PagedAttention( - self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window, - ) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class MixtralDecoderLayer(nn.Module): - - def __init__( - self, - config: MixtralConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - self.self_attn = MixtralAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - max_position=config.max_position_embeddings, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - sliding_window=config.sliding_window, - linear_method=linear_method) - self.block_sparse_moe = MixtralMoE( - num_experts=config.num_local_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> torch.Tensor: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.block_sparse_moe(hidden_states) - return hidden_states, residual - - -class MixtralModel(nn.Module): - - def __init__( - self, - config: MixtralConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - 
MixtralDecoderLayer(config, linear_method=linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer(positions, hidden_states, - kv_caches[i], input_metadata, - residual) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class MixtralForCausalLM(nn.Module): - - def __init__( - self, - config: MixtralConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = MixtralModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - - expert_params_mapping = [ - # (param_name, weight_name, expert_id) - ("ws" if weight_name in ["w1", "w3"] else "w2s", - f"experts.{expert_id}.{weight_name}.weight", expert_id) - for expert_id in range(self.config.num_local_experts) - for weight_name in ["w1", "w2", "w3"] - ] - - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, - cache_dir, - load_format, - revision, - fall_back_to_pt=False): - if "rotary_emb.inv_freq" in name: - continue - - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - for param_name, weight_name, expert_id in expert_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, - loaded_weight, - weight_name, - expert_id=expert_id) - break - else: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py deleted file mode 100644 index 22a876e2ef69137c7f3fd5e2aa7ea3e10e3baf4a..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/mpt.py +++ /dev/null @@ -1,298 +0,0 @@ -# coding=utf-8 -# Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main -import math -from typing import List, Optional, Tuple - -import torch -import torch.nn as nn - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearMethodBase, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.mpt import MPTConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -def _get_alibi_slopes( - total_num_heads: int, - alibi_bias_max: int, -) -> torch.Tensor: - next_power_of_2 = 2**math.ceil(math.log2(total_num_heads)) - m = torch.arange(1, next_power_of_2 + 1, dtype=torch.float32) - m = m.mul(alibi_bias_max / next_power_of_2) - slopes = 1.0 / torch.pow(2, m) - if next_power_of_2 != total_num_heads: - slopes = torch.concat([slopes[1::2], slopes[::2]])[:total_num_heads] - return slopes - - -class MPTAttention(nn.Module): - - def __init__( - self, - config: MPTConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.d_model = config.d_model - self.total_num_heads = config.n_heads - self.head_dim = self.d_model // self.total_num_heads - self.clip_qkv = config.attn_config["clip_qkv"] - self.qk_ln = config.attn_config["qk_ln"] - self.alibi_bias_max = config.attn_config["alibi_bias_max"] - if "kv_n_heads" in config.attn_config: - self.total_num_kv_heads = config.attn_config['kv_n_heads'] - else: - self.total_num_kv_heads = self.total_num_heads - assert not config.attn_config["prefix_lm"] - assert config.attn_config["alibi"] - - # pylint: disable=invalid-name - self.Wqkv = QKVParallelLinear( - self.d_model, - self.d_model // self.total_num_heads, - self.total_num_heads, - self.total_num_kv_heads, - bias=not config.no_bias, - linear_method=linear_method, - ) - if self.qk_ln: - self.q_ln = nn.LayerNorm(self.d_model) - self.k_ln = nn.LayerNorm(self.d_model) - self.out_proj = RowParallelLinear( - self.d_model, - self.d_model, - bias=not config.no_bias, - linear_method=linear_method, - ) - - tp_world_size = get_tensor_model_parallel_world_size() - assert self.total_num_heads % tp_world_size == 0 - self.num_heads = self.total_num_heads // tp_world_size - - if self.total_num_kv_heads >= tp_world_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. 
- assert self.total_num_kv_heads % tp_world_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. - assert tp_world_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_world_size) - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - # Create the alibi slopes and slice them. - tp_rank = get_tensor_model_parallel_rank() - head_start = tp_rank * self.num_heads - head_end = (tp_rank + 1) * self.num_heads - alibi_slopes = _get_alibi_slopes(self.total_num_heads, - self.alibi_bias_max) - alibi_slopes = alibi_slopes[head_start:head_end].tolist() - - self.head_dim = self.d_model // self.total_num_heads - scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes, - num_kv_heads=self.num_kv_heads) - - def forward( - self, - position_ids: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - del position_ids # unused. - qkv, _ = self.Wqkv(hidden_states) - if self.clip_qkv is not None: - qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - if self.qk_ln: - q = self.q_ln(q) - k = self.k_ln(k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.out_proj(attn_output) - return output - - -class MPTMLP(nn.Module): - - def __init__( - self, - config: MPTConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - hidden_size = config.d_model - expansion_ratio = config.expansion_ratio - intermediate_size = expansion_ratio * hidden_size - self.up_proj = ColumnParallelLinear( - hidden_size, - intermediate_size, - bias=not config.no_bias, - linear_method=linear_method, - ) - quant_config = getattr(linear_method, "quant_config", None) - self.act = get_act_fn("gelu", quant_config, intermediate_size) - self.down_proj = RowParallelLinear( - intermediate_size, - hidden_size, - bias=not config.no_bias, - linear_method=linear_method, - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x, _ = self.up_proj(x) - x = self.act(x) - x, _ = self.down_proj(x) - return x - - -class MPTBlock(nn.Module): - - def __init__( - self, - config: MPTConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - hidden_size = config.d_model - self.norm_1 = nn.LayerNorm(hidden_size) - self.attn = MPTAttention(config, linear_method) - self.norm_2 = nn.LayerNorm(hidden_size) - self.ffn = MPTMLP(config, linear_method) - - def forward( - self, - position_ids: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - x = self.norm_1(hidden_states) - x = self.attn( - position_ids=position_ids, - hidden_states=x, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - hidden_states = hidden_states + x - x = self.norm_2(hidden_states) - x = self.ffn(x) - hidden_states = hidden_states + x - return hidden_states - - -class MPTModel(nn.Module): - - def __init__( - self, - config: MPTConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - assert config.embedding_fraction == 1.0 - assert config.norm_type == "low_precision_layernorm" - - self.wte = VocabParallelEmbedding( - config.vocab_size, - config.d_model, - ) - self.blocks = nn.ModuleList( - 
[MPTBlock(config, linear_method) for _ in range(config.n_layers)]) - self.norm_f = nn.LayerNorm(config.d_model) - if config.no_bias: - for module in self.modules(): - if hasattr(module, "bias") and isinstance( - module.bias, nn.Parameter): - # Remove the bias term in Linear and LayerNorm. - module.register_parameter("bias", None) - - def forward( - self, - input_ids: torch.Tensor, - position_ids: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.wte(input_ids) - for i in range(len(self.blocks)): - block = self.blocks[i] - hidden_states = block( - position_ids, - hidden_states, - kv_caches[i], - input_metadata, - ) - hidden_states = self.norm_f(hidden_states) - return hidden_states - - -class MPTForCausalLM(nn.Module): - - def __init__( - self, - config: MPTConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - assert config.tie_word_embeddings - self.linear_method = linear_method - - self.transformer = MPTModel(config, linear_method) - self.lm_head_weight = self.transformer.wte.weight - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.transformer(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head_weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - params_dict = dict(self.named_parameters(remove_duplicate=False)) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py deleted file mode 100644 index 393b2dcabcd5a5a750d6a570c546c155f59054c5..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/opt.py +++ /dev/null @@ -1,354 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py -# Copyright 2023 The vLLM team. -# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights -# reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
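The `_get_alibi_slopes` helper in the MPT attention code above is easy to sanity-check numerically. The sketch below reproduces the same formula and shows the per-rank slicing that `MPTAttention.__init__` performs; the `tp_rank` and heads-per-rank values are invented for the example.

```python
# Numeric check of the ALiBi slope construction used by MPTAttention above.
# Same math as _get_alibi_slopes; the slicing at the end mirrors how each
# tensor-parallel rank keeps only the slopes for its own heads (the tp_rank
# and heads-per-rank values here are invented for the example).
import math
import torch

def alibi_slopes(total_num_heads: int, alibi_bias_max: int = 8) -> torch.Tensor:
    next_power_of_2 = 2 ** math.ceil(math.log2(total_num_heads))
    m = torch.arange(1, next_power_of_2 + 1, dtype=torch.float32)
    m = m.mul(alibi_bias_max / next_power_of_2)
    slopes = 1.0 / torch.pow(2, m)
    if next_power_of_2 != total_num_heads:
        # A common workaround when the head count is not a power of two:
        # interleave the slopes of the next power of two and truncate.
        slopes = torch.concat([slopes[1::2], slopes[::2]])[:total_num_heads]
    return slopes

slopes = alibi_slopes(8)
# Geometric sequence 1/2, 1/4, ..., 1/256 for 8 heads:
assert torch.allclose(slopes, 0.5 ** torch.arange(1, 9, dtype=torch.float32))

# With 8 heads split over tp_size = 2, rank 1 keeps heads 4..7:
tp_rank, num_heads_per_rank = 1, 4
rank_slopes = slopes[tp_rank * num_heads_per_rank:(tp_rank + 1) * num_heads_per_rank]
assert rank_slopes.tolist() == [2.0 ** -5, 2.0 ** -6, 2.0 ** -7, 2.0 ** -8]
```

These slopes are what `PagedAttention` uses to add a per-head bias that grows linearly with key distance, which is how MPT encodes position without rotary or learned position embeddings.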
-"""Inference-only OPT model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import OPTConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearMethodBase, - QKVParallelLinear, - ReplicatedLinear, - RowParallelLinear) -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class OPTLearnedPositionalEmbedding(nn.Embedding): - - def __init__(self, num_embeddings: int, embedding_dim: int): - # OPT is set up so that if padding_idx is specified then offset the - # embedding ids by 2 and adjust num_embeddings appropriately. Other - # models don't have this hack - self.offset = 2 - super().__init__(num_embeddings + self.offset, embedding_dim) - - def forward(self, positions: torch.Tensor): - return super().forward(positions + self.offset) - - -class OPTAttention(nn.Module): - - def __init__( - self, - embed_dim: int, - num_heads: int, - bias: bool = True, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.embed_dim = embed_dim - tensor_model_parallel_world_size = ( - get_tensor_model_parallel_world_size()) - total_num_heads = num_heads - assert num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = total_num_heads // tensor_model_parallel_world_size - self.head_dim = embed_dim // total_num_heads - self.scaling = self.head_dim**-0.5 - - self.qkv_proj = QKVParallelLinear( - embed_dim, - self.head_dim, - total_num_heads, - bias=bias, - linear_method=linear_method, - ) - self.out_proj = RowParallelLinear( - embed_dim, - embed_dim, - bias=bias, - linear_method=linear_method, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scaling) - - def forward( - self, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - key_cache, value_cache = kv_cache - attn_output = self.attn(q, k, v, key_cache, value_cache, - input_metadata) - output, _ = self.out_proj(attn_output) - return output - - -class OPTDecoderLayer(nn.Module): - - def __init__( - self, - config: OPTConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.self_attn = OPTAttention( - embed_dim=self.embed_dim, - num_heads=config.num_attention_heads, - bias=config.enable_bias, - linear_method=linear_method, - ) - self.do_layer_norm_before = config.do_layer_norm_before - - self.self_attn_layer_norm = nn.LayerNorm( - self.embed_dim, - elementwise_affine=config.layer_norm_elementwise_affine) - self.fc1 = ColumnParallelLinear( - self.embed_dim, - config.ffn_dim, - bias=config.enable_bias, - linear_method=linear_method, - ) - quant_config = getattr(linear_method, "quant_config", None) - 
self.activation_fn = get_act_fn(config.activation_function, - quant_config, config.ffn_dim) - self.fc2 = RowParallelLinear( - config.ffn_dim, - self.embed_dim, - bias=config.enable_bias, - linear_method=linear_method, - ) - self.final_layer_norm = nn.LayerNorm( - self.embed_dim, - elementwise_affine=config.layer_norm_elementwise_affine) - - def forward( - self, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - # Self Attention - residual = hidden_states - # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention - if self.do_layer_norm_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - hidden_states = self.self_attn(hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata) - hidden_states = residual + hidden_states - # 350m applies layer norm AFTER attention - if not self.do_layer_norm_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Fully Connected - residual = hidden_states - # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention - if self.do_layer_norm_before: - hidden_states = self.final_layer_norm(hidden_states) - hidden_states, _ = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - hidden_states, _ = self.fc2(hidden_states) - hidden_states = residual + hidden_states - # 350m applies layer norm AFTER attention - if not self.do_layer_norm_before: - hidden_states = self.final_layer_norm(hidden_states) - return hidden_states - - -class OPTDecoder(nn.Module): - - def __init__( - self, - config: OPTConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.max_target_positions = config.max_position_embeddings - self.vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.word_embed_proj_dim, - ) - # Positional embeddings are replicated (not sharded). - self.embed_positions = OPTLearnedPositionalEmbedding( - config.max_position_embeddings, config.hidden_size) - - # Project out & in will be replicated if they exist. 
- if config.word_embed_proj_dim != config.hidden_size: - self.project_out = ReplicatedLinear(config.hidden_size, - config.word_embed_proj_dim, - bias=False, - linear_method=linear_method) - else: - self.project_out = None - - if config.word_embed_proj_dim != config.hidden_size: - self.project_in = ReplicatedLinear(config.word_embed_proj_dim, - config.hidden_size, - bias=False, - linear_method=linear_method) - else: - self.project_in = None - - # Note that the only purpose of `config._remove_final_layer_norm` is to - # keep backward compatibility with checkpoints that have been fine-tuned - # before transformers v4.20.1 - # see https://github.com/facebookresearch/metaseq/pull/164 - if config.do_layer_norm_before and not config._remove_final_layer_norm: - self.final_layer_norm = nn.LayerNorm( - config.hidden_size, - elementwise_affine=config.layer_norm_elementwise_affine) - else: - self.final_layer_norm = None - - self.layers = nn.ModuleList([ - OPTDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - inputs_embeds = self.embed_tokens(input_ids) - pos_embeds = self.embed_positions(positions) - if self.project_in is not None: - inputs_embeds, _ = self.project_in(inputs_embeds) - hidden_states = inputs_embeds + pos_embeds - - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states = layer(hidden_states, kv_caches[i], input_metadata) - - if self.final_layer_norm is not None: - hidden_states = self.final_layer_norm(hidden_states) - if self.project_out is not None: - hidden_states, _ = self.project_out(hidden_states) - return hidden_states - - -class OPTModel(nn.Module): - - def __init__( - self, - config: OPTConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.decoder = OPTDecoder(config, linear_method) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - return self.decoder(input_ids, positions, kv_caches, input_metadata) - - -class OPTForCausalLM(nn.Module): - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = OPTModel(config, linear_method) - self.lm_head_weight = self.model.decoder.embed_tokens.weight - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head_weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, 
load_format, revision): - if "lm_head.weight" in name: - continue - if name.startswith("decoder."): - name = "model." + name - - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py deleted file mode 100644 index d143261968288ffc9f012e536b3c72c391f26407..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/phi.py +++ /dev/null @@ -1,305 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py -# Copyright 2023 The vLLM team. -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. -# -# BSD 3-Clause License -# -# Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
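One easy-to-miss detail in the OPT code above is the position-id offset in `OPTLearnedPositionalEmbedding`: OPT checkpoints store two extra rows at the start of the positional table, so every position id is shifted by 2 before lookup. A toy restatement with invented sizes:

```python
# Toy restatement of the position-id offset handled by
# OPTLearnedPositionalEmbedding above. OPT checkpoints reserve the first two
# rows of the positional table, so position i is looked up at row i + 2.
# Sizes here are invented for the example.
import torch
from torch import nn

class LearnedPositionalEmbedding(nn.Embedding):

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # The offset must be added both to the table size and to every lookup.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, positions: torch.Tensor) -> torch.Tensor:
        return super().forward(positions + self.offset)

emb = LearnedPositionalEmbedding(num_embeddings=2048, embedding_dim=16)
positions = torch.arange(4)                 # token positions 0, 1, 2, 3
out = emb(positions)                        # rows 2..5 of the (2050, 16) table
assert emb.weight.shape == (2050, 16)
assert torch.equal(out, emb.weight[2:6])
```

Because HF OPT checkpoints already include those two extra rows, `load_weights` above can copy `decoder.embed_positions.weight` into this module without any reshaping.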
-"""Inference-only Phi-1.5 model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import PretrainedConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearMethodBase, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class PhiAttention(nn.Module): - - def __init__(self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None): - super().__init__() - self.total_num_heads = config.num_attention_heads - self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.total_num_heads - - tensor_model_parallel_world_size = ( - get_tensor_model_parallel_world_size()) - assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = (self.total_num_heads // - tensor_model_parallel_world_size) - - # pylint: disable=C0103 - self.qkv_proj = QKVParallelLinear( - self.hidden_size, - self.head_size, - self.total_num_heads, - bias=True, - linear_method=linear_method, - ) - self.dense = RowParallelLinear( - self.hidden_size, - self.hidden_size, - linear_method=linear_method, - ) - - scaling = self.head_size**-0.5 - rotary_dim = int(config.partial_rotary_factor * - (config.hidden_size // config.num_attention_heads)) - assert rotary_dim % 2 == 0 - - # pylint: disable=C0301 - # Refer to: - # https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518 - rope_theta = 10000 - max_position_embeddings = getattr(config, "n_positions", 2048) - self.rotary_emb = get_rope( - self.head_size, - rotary_dim=rotary_dim, - max_position=max_position_embeddings, - base=rope_theta, - ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) - - def forward( - self, - position_ids: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.dense(attn_output) - return output - - -class PhiMLP(nn.Module): - - def __init__(self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None): - super().__init__() - - n_inner = getattr(config, "n_inner", None) - n_inner = n_inner if n_inner is not None else 4 * config.hidden_size - - self.fc1 = ColumnParallelLinear( - config.hidden_size, - n_inner, - linear_method=linear_method, - ) - self.fc2 = RowParallelLinear( - n_inner, - config.hidden_size, - linear_method=linear_method, - ) - quant_config = getattr(linear_method, "quant_config", None) - self.act = 
get_act_fn(config.hidden_act, quant_config, n_inner) - - def forward(self, hidden_states): - hidden_states, _ = self.fc1(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states, _ = self.fc2(hidden_states) - return hidden_states - - -class PhiLayer(nn.Module): - - def __init__(self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None): - super().__init__() - self.input_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - self.self_attn = PhiAttention(config, linear_method) - self.mlp = PhiMLP(config, linear_method) - - def forward( - self, - position_ids: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - attn_outputs = self.self_attn( - position_ids=position_ids, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - feed_forward_hidden_states = self.mlp(hidden_states) - hidden_states = attn_outputs + feed_forward_hidden_states + residual - return hidden_states - - -class PhiModel(nn.Module): - - def __init__(self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None): - super().__init__() - self.config = config - self.linear_method = linear_method - self.embed_tokens = VocabParallelEmbedding(config.vocab_size, - config.hidden_size) - self.layers = nn.ModuleList([ - PhiLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.final_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - for i in range(self.config.num_hidden_layers): - layer = self.layers[i] - hidden_states = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - ) - - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states - - -class PhiForCausalLM(nn.Module): - - def __init__(self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None): - super().__init__() - self.config = config - self.linear_method = linear_method - - self.model = PhiModel(config, linear_method) - - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - bias=True) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - head = self.lm_head - next_tokens = self.sampler(head.weight, hidden_states, - sampling_metadata, head.bias) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v") - ] - params_dict = dict(self.named_parameters()) - - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - - for 
(param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # pylint: disable=E1136 - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py deleted file mode 100644 index fbc7320fb45a46377fd6d717aadae82257e00ed0..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/qwen.py +++ /dev/null @@ -1,288 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py -# Copyright (c) Alibaba Cloud. -# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE -"""Inference-only QWen model compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.qwen import QWenConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class QWenMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str = "silu", - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.c_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.c_proj(x) - return x - - -class QWenAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - max_position_embeddings: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = hidden_size - tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( - ) - self.total_num_heads = num_heads - assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = (self.total_num_heads // - tensor_model_parallel_world_size) - self.head_dim = hidden_size // self.total_num_heads - self.c_attn = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - bias=True, - linear_method=linear_method, - ) - self.c_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - self.scaling = self.head_dim**-0.5 - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.c_attn(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - - output, _ = self.c_proj(attn_output) - return output - - -class QWenBlock(nn.Module): - - def __init__( - self, - config: QWenConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - self.attn = QWenAttention(config.hidden_size, - config.num_attention_heads, - config.max_position_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - linear_method=linear_method) - - self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - - self.mlp = QWenMLP(config.hidden_size, - config.intermediate_size // 2, - linear_method=linear_method) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.ln_1(hidden_states) - else: - hidden_states, residual = self.ln_1(hidden_states, residual) - hidden_states = self.attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.ln_2(hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class QWenModel(nn.Module): - - def __init__( - self, - config: QWenConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.vocab_size = config.vocab_size - - self.wte = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - 
self.h = nn.ModuleList([ - QWenBlock(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.wte(input_ids) - residual = None - for i in range(len(self.h)): - layer = self.h[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.ln_f(hidden_states, residual) - return hidden_states - - -class QWenLMHeadModel(nn.Module): - - def __init__( - self, - config: QWenConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.transformer = QWenModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.transformer(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("gate_up_proj", "w2", 0), - ("gate_up_proj", "w1", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py deleted file mode 100644 index e823e6f8c3dbe3f632d5479b7fdcd24b04307eec..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/qwen2.py +++ /dev/null @@ -1,336 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py -# Copyright 2024 The Qwen team. -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only Qwen2 model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import Qwen2Config - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class Qwen2MLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class Qwen2Attention(nn.Module): - - def __init__(self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - max_position: int = 4096 * 32, - rope_theta: float = 10000, - use_sliding_window: bool = False, - linear_method: Optional[LinearMethodBase] = None, - sliding_window: Optional[int] = None) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.sliding_window = sliding_window if use_sliding_window else None - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=True, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position, - base=self.rope_theta, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class Qwen2DecoderLayer(nn.Module): - - def __init__( - self, - config: Qwen2Config, - layer_idx: int, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - use_sliding_window = config.use_sliding_window and layer_idx < config.max_window_layers - self.self_attn = Qwen2Attention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - max_position=config.max_position_embeddings, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - use_sliding_window=use_sliding_window, - linear_method=linear_method, - sliding_window=config.sliding_window) - self.mlp = Qwen2MLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class Qwen2Model(nn.Module): - - def __init__( - self, - config: Qwen2Config, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = 
config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - Qwen2DecoderLayer(config, layer_idx, linear_method) - for layer_idx in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class Qwen2ForCausalLM(nn.Module): - - def __init__( - self, - config: Qwen2Config, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = Qwen2Model(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py deleted file mode 100644 index 95e5ad8ede63e6996a78be1979f3901ad5edb36c..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/stablelm.py +++ /dev/null @@ -1,299 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This code is based on the following work: -# https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/modeling_stablelm_epoch.py -# https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json -"""Inference-only StableLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import PretrainedConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class StablelmMLP(nn.Module): - - def __init__(self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None) -> None: - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_up_proj = MergedColumnParallelLinear( - config.hidden_size, [config.intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(config.intermediate_size, - config.hidden_size, - bias=False) - self.act_fn = SiluAndMul() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class StablelmAttention(nn.Module): - - def __init__(self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None) -> None: - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = config.num_attention_heads - self.num_heads = self.total_num_heads // tp_size - - self.total_num_key_value_heads = config.num_key_value_heads - if self.total_num_key_value_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_key_value_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_key_value_heads == 0 - self.num_key_value_heads = max( - 1, self.total_num_key_value_heads // tp_size) - self.head_dim = self.hidden_size // self.total_num_heads - self.max_position_embeddings = config.max_position_embeddings - self.rotary_ndims = int(self.head_dim * self.config.rope_pct) - self.scaling = self.head_dim**-0.5 - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_key_value_heads * self.head_dim - self.qkv_bias = getattr(config, "use_qkv_bias", False) - if (self.head_dim * self.num_heads * tp_size) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") - - self.qkv_proj = QKVParallelLinear(self.hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_key_value_heads, - self.qkv_bias, - linear_method=linear_method) - self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, - self.hidden_size, - bias=False, - linear_method=linear_method) - self.rotary_ndims = int(self.head_dim * self.config.rope_pct) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.rotary_ndims, - max_position=self.config.max_position_embeddings, - base=self.config.rope_theta, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_key_value_heads) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class StablelmDecoderLayer(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.self_attn = StablelmAttention(config) - self.mlp = StablelmMLP(config, linear_method) - self.input_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.norm_eps) - self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - return hidden_states, residual - - -class StableLMEpochModel(nn.Module): - - def __init__(self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None) -> None: - super().__init__() - # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - StablelmDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = 
nn.LayerNorm(config.hidden_size, eps=config.norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - ) - hidden_states = self.norm(hidden_states) - return hidden_states - - -class StablelmForCausalLM(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = StableLMEpochModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/yi.py b/vllm/model_executor/models/yi.py deleted file mode 100644 index 53daa6c4cd939b7483bc2e3c254f568faf22d069..0000000000000000000000000000000000000000 --- a/vllm/model_executor/models/yi.py +++ /dev/null @@ -1,330 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. 
It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only Yi model (https://01.ai) compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn -from vllm.transformers_utils.configs.yi import YiConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class YiMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class YiAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. 
- assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. - assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class YiDecoderLayer(nn.Module): - - def __init__( - self, - config: YiConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = YiAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - ) - self.mlp = YiMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.ln1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.ln2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.ln1(hidden_states) - else: - hidden_states, residual = self.ln1(hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.ln2(hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class YiModel(nn.Module): - - def __init__( - self, - config: YiConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = 
config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - YiDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class YiForCausalLM(nn.Module): - - def __init__( - self, - config: YiConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = YiModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/parallel_utils/README.md b/vllm/model_executor/parallel_utils/README.md deleted file mode 100644 index b25e3afddad9c9136d814a0eaf70fcf15dd21921..0000000000000000000000000000000000000000 --- a/vllm/model_executor/parallel_utils/README.md +++ /dev/null @@ -1 +0,0 @@ -The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the codes that are used in inference. 
\ No newline at end of file diff --git a/vllm/model_executor/parallel_utils/__init__.py b/vllm/model_executor/parallel_utils/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py deleted file mode 100644 index 65671994f3309322b4fd3aeddbfd47e67e708462..0000000000000000000000000000000000000000 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ /dev/null @@ -1,198 +0,0 @@ -from collections import namedtuple -from typing import Any, Dict, List, Optional, Union - -from torch.distributed import ProcessGroup - -import torch - -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size, - get_tensor_model_parallel_group, -) -from vllm.model_executor.parallel_utils.custom_all_reduce import custom_all_reduce - - -def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: - """All-reduce the input tensor across model parallel group. - - NOTE: This operation will be applied in-place on the input tensor if - disable_custom_all_reduce is set to True. Otherwise, this operation may or - may not be applied in place depending on whether custom all reduce is - invoked for a particular tensor, which further depends on the tensor size - and GPU topology. - - TLDR: always assume this function modifies its input, but use the return - value as the output. - """ - # Bypass the function if we are using only 1 GPU. - if get_tensor_model_parallel_world_size() == 1: - return input_ - out = custom_all_reduce(input_) - if out is not None: - return out - torch.distributed.all_reduce(input_, - group=get_tensor_model_parallel_group()) - return input_ - - -def tensor_model_parallel_all_gather(input_: torch.Tensor, - dim: int = -1) -> torch.Tensor: - """All-gather the input tensor across model parallel group.""" - world_size = get_tensor_model_parallel_world_size() - # Bypass the function if we are using only 1 GPU. - if world_size == 1: - return input_ - assert -input_.dim() <= dim < input_.dim(), ( - f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") - if dim < 0: - # Convert negative dim to positive. - dim += input_.dim() - input_size = input_.size() - # Allocate output tensor. - output_tensor = torch.empty((world_size, ) + input_size, - dtype=input_.dtype, - device=input_.device) - # All-gather. - torch.distributed.all_gather_into_tensor( - output_tensor, input_, group=get_tensor_model_parallel_group()) - # Reshape - output_tensor = output_tensor.movedim(0, dim) - output_tensor = output_tensor.reshape(input_size[:dim] + - (world_size * input_size[dim], ) + - input_size[dim + 1:]) - return output_tensor - - -def tensor_model_parallel_gather(input_: torch.Tensor, - dst: int = 0, - dim: int = -1) -> torch.Tensor: - """Gather the input tensor across model parallel group. - - NOTE: We assume that the input tensor is on the same device across - all the ranks. - """ - world_size = get_tensor_model_parallel_world_size() - # Bypass the function if we are using only 1 GPU. - if world_size == 1: - return input_ - assert -input_.dim() <= dim < input_.dim(), ( - f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") - if dim < 0: - # Convert negative dim to positive. - dim += input_.dim() - # Allocate output tensor. 
- if get_tensor_model_parallel_rank() == dst: - gather_list = [torch.empty_like(input_) for _ in range(world_size)] - else: - gather_list = None - # Gather. - torch.distributed.gather(input_, - gather_list, - dst=dst, - group=get_tensor_model_parallel_group()) - if get_tensor_model_parallel_rank() == dst: - output_tensor = torch.cat(gather_list, dim=dim) - else: - output_tensor = None - return output_tensor - - -def broadcast(input_: torch.Tensor, - src: int = 0, - group: Optional[ProcessGroup] = None): - """Broadcast the input tensor.""" - group = group or torch.distributed.group.WORLD - ranks = torch.distributed.get_process_group_ranks(group) - assert src in ranks, f"Invalid src rank ({src})" - - # Bypass the function if we are using only 1 GPU. - world_size = torch.distributed.get_world_size(group=group) - if world_size == 1: - return input_ - # Broadcast. - torch.distributed.broadcast(input_, src=src, group=group) - return input_ - - -def broadcast_object_list(obj_list: List[Any], - src: int = 0, - group: Optional[ProcessGroup] = None): - """Broadcast the input object list.""" - group = group or torch.distributed.group.WORLD - ranks = torch.distributed.get_process_group_ranks(group) - assert src in ranks, f"Invalid src rank ({src})" - - # Bypass the function if we are using only 1 GPU. - world_size = torch.distributed.get_world_size(group=group) - if world_size == 1: - return obj_list - # Broadcast. - torch.distributed.broadcast_object_list(obj_list, src=src, group=group) - return obj_list - - -TensorMetadata = namedtuple("TensorMetadata", ["dtype", "size"]) - - -def broadcast_tensor_dict( - tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, - src: int = 0, - group: Optional[ProcessGroup] = None, -) -> Dict[Any, Union[torch.Tensor, Any]]: - """Broadcast the input tensor dictionary.""" - group = group or torch.distributed.group.WORLD - ranks = torch.distributed.get_process_group_ranks(group) - assert src in ranks, f"Invalid src rank ({src})" - - # Bypass the function if we are using only 1 GPU. - world_size = torch.distributed.get_world_size(group=group) - if world_size == 1: - return tensor_dict - - rank = torch.distributed.get_rank() - if rank == src: - assert isinstance( - tensor_dict, - dict), (f"Expecting a dictionary, got {type(tensor_dict)}") - metadata_list = [] - for key, value in tensor_dict.items(): - if isinstance(value, torch.Tensor): - assert value.is_cuda, ( - f"Tensor {key}: {value} is not on cuda. 
Currently we only " - f"support broadcasting tensors on cuda.") - metadata_list.append( - (key, TensorMetadata(value.dtype, value.size()))) - else: - metadata_list.append((key, value)) - torch.distributed.broadcast_object_list([metadata_list], - src=src, - group=group) - for key, value in metadata_list: - if isinstance(value, TensorMetadata): - tensor = tensor_dict[key] - torch.distributed.broadcast(tensor, src=src) - else: - recv_metadata_list = [None] - torch.distributed.broadcast_object_list(recv_metadata_list, - src=src, - group=group) - metadata_list = recv_metadata_list[0] - tensor_dict = {} - async_handles = [] - for key, value in metadata_list: - if isinstance(value, TensorMetadata): - tensor = torch.empty(value.size, - dtype=value.dtype, - device="cuda") - async_handle = torch.distributed.broadcast(tensor, - src=src, - async_op=True, - group=group) - async_handles.append(async_handle) - tensor_dict[key] = tensor - else: - tensor_dict[key] = value - for async_handle in async_handles: - async_handle.wait() - return tensor_dict diff --git a/vllm/model_executor/parallel_utils/custom_all_reduce.py b/vllm/model_executor/parallel_utils/custom_all_reduce.py deleted file mode 100644 index 5b88649cc2129c4fdddb88eaa9280390c303b38f..0000000000000000000000000000000000000000 --- a/vllm/model_executor/parallel_utils/custom_all_reduce.py +++ /dev/null @@ -1,223 +0,0 @@ -from contextlib import contextmanager -from typing import Optional - -import torch -import torch.distributed as dist - -from vllm.logger import init_logger -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank) - -try: - from vllm._C import custom_ar - import pynvml -except ImportError: - # For AMD GPUs - custom_ar = None - pynvml = None - -logger = init_logger(__name__) - -_CA_HANDLE = None -_IS_CAPTURING = False -_SUPPORTED_WORLD_SIZES = [2, 4, 6, 8] - - -def init_custom_ar() -> None: - global _CA_HANDLE - if _CA_HANDLE is not None: - return - rank = get_tensor_model_parallel_rank() - world_size = get_tensor_model_parallel_world_size() - if world_size not in _SUPPORTED_WORLD_SIZES: - logger.warn( - "Custom allreduce is disabled due to an unsupported world size: " - "%d. Supported world sizes: %s. To silence this warning, specify " - "disable_custom_all_reduce=True explicitly.", world_size, - str(_SUPPORTED_WORLD_SIZES)) - return - if not _can_p2p(rank, world_size): - logger.warn( - "Custom allreduce is disabled because your platform lacks GPU P2P" - " capability. 
To silence this warning, specify " - "disable_custom_all_reduce=True explicitly.") - return - _CA_HANDLE = CustomAllreduce(rank, world_size) - - -def begin_capture() -> None: - global _IS_CAPTURING - _IS_CAPTURING = True - - -def end_capture() -> None: - global _IS_CAPTURING - _IS_CAPTURING = False - - -def is_capturing() -> bool: - return _IS_CAPTURING and _CA_HANDLE is not None - - -def get_handle() -> Optional["CustomAllreduce"]: - return _CA_HANDLE - - -@contextmanager -def capture(): - try: - begin_capture() - yield - finally: - end_capture() - handle = get_handle() - if handle is not None: - handle.register_graph_buffers() - - -def custom_all_reduce(input: torch.Tensor) -> Optional[torch.Tensor]: - ca_handle = get_handle() - # when custom allreduce is disabled, this will be None - if ca_handle is None: - return - if is_capturing(): - if torch.cuda.is_current_stream_capturing(): - if ca_handle.should_custom_ar(input): - return ca_handle.all_reduce_reg(input) - else: - if ca_handle.should_custom_ar(input): - # if warm up, mimic the allocation pattern - # since custom allreduce is out-of-place - return torch.empty_like(input) - else: - # note: outside of cuda graph context, - # custom allreduce incurs a cost of cudaMemcpy, which should - # be small(<=1% of overall latency) compared to the performance - # gains of using custom kernels - if ca_handle.should_custom_ar(input): - return ca_handle.all_reduce_unreg(input) - - -@contextmanager -def _nvml(): - try: - pynvml.nvmlInit() - yield - finally: - pynvml.nvmlShutdown() - - -# query if the set of gpus are fully connected by nvlink (1 hop) -@_nvml() -def _is_full_nvlink(rank, world_size): - handle = pynvml.nvmlDeviceGetHandleByIndex(rank) - for i in range(world_size): - if i != rank: - try: - link_state = pynvml.nvmlDeviceGetNvLinkState(handle, i) - if not link_state: - return False - except pynvml.NVMLError as error: - logger.info( - f"NVLink detection failed with message \"{str(error)}\". " - "This is normal if your machine has no NVLink equipped") - return False - return True - - -def _can_p2p(rank: int, world_size: int) -> bool: - for i in range(world_size): - if i == rank: - continue - if not torch.cuda.can_device_access_peer(rank, i): - return False - return True - - -class CustomAllreduce: - - # max_size: max supported allreduce size - def __init__(self, rank, world_size, max_size=8192 * 1024) -> None: - # buffer memory is owned by this Python class and passed to C++ - # meta data is composed of two parts: meta data for synchronization - # (256 bytes) and a temporary buffer for storing intermediate - # allreduce results. - self.meta = torch.zeros(custom_ar.meta_size() + max_size, - dtype=torch.uint8, - device="cuda") - # This is a pre-registered IPC buffer. In eager mode, input tensors - # are first copied into this buffer before allreduce is performed - self.buffer = torch.empty(max_size, dtype=torch.uint8, device="cuda") - # This is a buffer for storing the tuples of pointers pointing to - # IPC buffers from all ranks. Each registered tuple has size of - # 8*world_size bytes where world_size is at most 8. Allocating 8MB - # is enough for 131072 such tuples. The largest model I've seen only - # needs fewer than 10000 registered tuples. 
- self.rank_data = torch.empty(8 * 1024 * 1024, - dtype=torch.uint8, - device="cuda") - self.max_size = max_size - self.world_size = world_size - handles, offsets = self._get_ipc_meta(self.meta) - self.full_nvlink = _is_full_nvlink(rank, world_size) - self._ptr = custom_ar.init_custom_ar(self.meta, self.rank_data, - handles, offsets, rank, - self.full_nvlink) - self.fast_cond = self.full_nvlink or world_size <= 2 - self.register_buffer(self.buffer) - - def _get_ipc_meta(self, inp: torch.Tensor): - data = inp.untyped_storage()._share_cuda_() - shard_data = ( - data[1], # ipc handle to base ptr - data[3], # offset of base ptr - ) - return self._gather_ipc_meta(shard_data) - - def _gather_ipc_meta(self, shard_data): - all_data = [None] * self.world_size - dist.all_gather_object(all_data, shard_data) - - handles = [] - offsets = [] - for i in range(len(all_data)): - handles.append(all_data[i][0]) - offsets.append(all_data[i][1]) - return handles, offsets - - def register_buffer(self, inp: torch.Tensor): - handles, offsets = self._get_ipc_meta(inp) - custom_ar.register_buffer(self._ptr, inp, handles, offsets) - - def register_graph_buffers(self): - handle, offset = custom_ar.get_graph_buffer_ipc_meta(self._ptr) - handles, offsets = self._gather_ipc_meta((bytes(handle), offset)) - logger.info("Registering %d cuda graph addresses", len(offset)) - custom_ar.register_graph_buffers(self._ptr, handles, offsets) - - def should_custom_ar(self, inp: torch.Tensor): - return custom_ar.should_custom_ar(inp, self.max_size, self.world_size, - self.full_nvlink) - - # all reduce, assuming inp tensor is IPC registered with register_buffer, - # or, in the context of cuda graphs, register_graph_buffers - def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None): - if out is None: - out = torch.empty_like(inp) - custom_ar.all_reduce_reg(self._ptr, inp, out) - return out - - # all reduce, assuming inp tensor is NOT IPC registered - def all_reduce_unreg(self, inp: torch.Tensor, out: torch.Tensor = None): - if out is None: - out = torch.empty_like(inp) - custom_ar.all_reduce_unreg(self._ptr, inp, self.buffer, out) - return out - - def close(self): - if self._ptr: - custom_ar.dispose(self._ptr) - self._ptr = 0 - - def __del__(self): - self.close() diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py deleted file mode 100644 index 46bff7e16b79f966171a7622c5e37aa6702beaef..0000000000000000000000000000000000000000 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2023 The vLLM team. -# Adapted from -# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -"""Tensor and pipeline parallel groups.""" - -import torch - -# Tensor model parallel group that the current rank belongs to. -_TENSOR_MODEL_PARALLEL_GROUP = None -# Pipeline model parallel group that the current rank belongs to. -_PIPELINE_MODEL_PARALLEL_GROUP = None - -# A list of global ranks for each pipeline group to ease calculation of the -# source rank when broadcasting from the first or last pipeline stage. -_PIPELINE_GLOBAL_RANKS = None - - -def initialize_model_parallel( - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, -) -> None: - """ - Initialize model parallel groups. - - Arguments: - tensor_model_parallel_size: number of GPUs used for tensor model - parallelism. 
- pipeline_model_parallel_size: number of GPUs used for pipeline model - parallelism. - - Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we - use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize - the model pipeline. The present function will - create 4 tensor model-parallel groups and 2 pipeline model-parallel groups: - 4 tensor model-parallel groups: - [g0, g1], [g2, g3], [g4, g5], [g6, g7] - 2 pipeline model-parallel groups: - [g0, g2, g4, g6], [g1, g3, g5, g7] - Note that for efficiency, the caller should make sure adjacent ranks - are on the same DGX box. For example if we are using 2 DGX-1 boxes - with a total of 16 GPUs, rank 0 to 7 belong to the first box and - ranks 8 to 15 belong to the second box. - """ - # Get world size and rank. Ensure some consistency. - assert torch.distributed.is_initialized() - world_size: int = torch.distributed.get_world_size() - - if (world_size != - tensor_model_parallel_size * pipeline_model_parallel_size): - raise RuntimeError( - f"world_size ({world_size}) is not equal to " - f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " - f"pipeline_model_parallel_size ({pipeline_model_parallel_size})") - - num_tensor_model_parallel_groups: int = (world_size // - tensor_model_parallel_size) - num_pipeline_model_parallel_groups: int = (world_size // - pipeline_model_parallel_size) - rank = torch.distributed.get_rank() - - # Build the tensor model-parallel groups. - global _TENSOR_MODEL_PARALLEL_GROUP - assert _TENSOR_MODEL_PARALLEL_GROUP is None, ( - "tensor model parallel group is already initialized") - for i in range(num_tensor_model_parallel_groups): - ranks = range(i * tensor_model_parallel_size, - (i + 1) * tensor_model_parallel_size) - group = torch.distributed.new_group(ranks) - if rank in ranks: - _TENSOR_MODEL_PARALLEL_GROUP = group - - # Build the pipeline model-parallel groups. - global _PIPELINE_MODEL_PARALLEL_GROUP - global _PIPELINE_GLOBAL_RANKS - assert _PIPELINE_MODEL_PARALLEL_GROUP is None, ( - "pipeline model parallel group is already initialized") - for i in range(num_pipeline_model_parallel_groups): - ranks = range(i, world_size, num_pipeline_model_parallel_groups) - group = torch.distributed.new_group(ranks) - if rank in ranks: - _PIPELINE_MODEL_PARALLEL_GROUP = group - _PIPELINE_GLOBAL_RANKS = ranks - - -def ensure_model_parallel_initialized( - tensor_model_parallel_size: int, - pipeline_model_parallel_size: int, -) -> None: - """Helper to initialize model parallel groups if they are not initialized, - or ensure tensor-parallel and pipeline-parallel sizes are equal to expected - values if the model parallel groups are initialized. - """ - if not model_parallel_is_initialized(): - initialize_model_parallel(tensor_model_parallel_size, - pipeline_model_parallel_size) - return - - assert ( - get_tensor_model_parallel_world_size() == tensor_model_parallel_size - ), ("tensor parallel group already initialized, but of unexpected size: " - f"{get_tensor_model_parallel_world_size()=} vs. " - f"{tensor_model_parallel_size=}") - assert (get_pipeline_model_parallel_world_size( - ) == pipeline_model_parallel_size), ( - "pipeline parallel group already initialized, but of unexpected size: " - f"{get_pipeline_model_parallel_world_size()=} vs. 
" - f"{pipeline_model_parallel_size=}") - - -def model_parallel_is_initialized(): - """Check if tensor and pipeline parallel groups are initialized.""" - return (_TENSOR_MODEL_PARALLEL_GROUP is not None - and _PIPELINE_MODEL_PARALLEL_GROUP is not None) - - -def get_tensor_model_parallel_group(): - """Get the tensor model parallel group the caller rank belongs to.""" - assert _TENSOR_MODEL_PARALLEL_GROUP is not None, ( - "tensor model parallel group is not initialized") - return _TENSOR_MODEL_PARALLEL_GROUP - - -def get_pipeline_model_parallel_group(): - """Get the pipeline model parallel group the caller rank belongs to.""" - assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, ( - "pipeline model parallel group is not initialized") - return _PIPELINE_MODEL_PARALLEL_GROUP - - -def get_tensor_model_parallel_world_size(): - """Return world size for the tensor model parallel group.""" - return torch.distributed.get_world_size( - group=get_tensor_model_parallel_group()) - - -def get_pipeline_model_parallel_world_size(): - """Return world size for the pipeline model parallel group.""" - return torch.distributed.get_world_size( - group=get_pipeline_model_parallel_group()) - - -def get_tensor_model_parallel_rank(): - """Return my rank for the tensor model parallel group.""" - return torch.distributed.get_rank(group=get_tensor_model_parallel_group()) - - -def get_pipeline_model_parallel_rank(): - """Return my rank for the pipeline model parallel group.""" - return torch.distributed.get_rank( - group=get_pipeline_model_parallel_group()) - - -def get_tensor_model_parallel_src_rank(): - """Calculate the global rank corresponding to the first local rank - in the tensor model parallel group.""" - global_rank = torch.distributed.get_rank() - local_world_size = get_tensor_model_parallel_world_size() - return (global_rank // local_world_size) * local_world_size - - -def get_pipeline_model_parallel_first_rank(): - """Return the global rank of the first process in the pipeline for the - current tensor parallel group""" - assert _PIPELINE_GLOBAL_RANKS is not None, ( - "Pipeline parallel group is not initialized") - return _PIPELINE_GLOBAL_RANKS[0] - - -def get_pipeline_model_parallel_last_rank(): - """Return the global rank of the last process in the pipeline for the - current tensor parallel group""" - assert _PIPELINE_GLOBAL_RANKS is not None, ( - "Pipeline parallel group is not initialized") - last_rank_local = get_pipeline_model_parallel_world_size() - 1 - return _PIPELINE_GLOBAL_RANKS[last_rank_local] - - -def get_pipeline_model_parallel_next_rank(): - """Return the global rank that follows the caller in the pipeline""" - assert _PIPELINE_GLOBAL_RANKS is not None, ( - "Pipeline parallel group is not initialized") - rank_in_pipeline = get_pipeline_model_parallel_rank() - world_size = get_pipeline_model_parallel_world_size() - return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size] - - -def get_pipeline_model_parallel_prev_rank(): - """Return the global rank that precedes the caller in the pipeline""" - assert _PIPELINE_GLOBAL_RANKS is not None, ( - "Pipeline parallel group is not initialized") - rank_in_pipeline = get_pipeline_model_parallel_rank() - world_size = get_pipeline_model_parallel_world_size() - return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size] - - -def destroy_model_parallel(): - """Set the groups to none and destroy them.""" - global _TENSOR_MODEL_PARALLEL_GROUP - if _TENSOR_MODEL_PARALLEL_GROUP: - torch.distributed.destroy_process_group(_TENSOR_MODEL_PARALLEL_GROUP) 
- _TENSOR_MODEL_PARALLEL_GROUP = None - global _PIPELINE_MODEL_PARALLEL_GROUP - if _PIPELINE_MODEL_PARALLEL_GROUP: - torch.distributed.destroy_process_group(_PIPELINE_MODEL_PARALLEL_GROUP) - _PIPELINE_MODEL_PARALLEL_GROUP = None - global _PIPELINE_GLOBAL_RANKS - _PIPELINE_GLOBAL_RANKS = None diff --git a/vllm/model_executor/parallel_utils/utils.py b/vllm/model_executor/parallel_utils/utils.py deleted file mode 100644 index 0cd420c8e11b564c45f9ad53e087dd3ce93456c1..0000000000000000000000000000000000000000 --- a/vllm/model_executor/parallel_utils/utils.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2023 The vLLM team. -# Adapted from -# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -from typing import Sequence - -import torch - - -def ensure_divisibility(numerator, denominator): - """Ensure that numerator is divisible by the denominator.""" - assert numerator % denominator == 0, "{} is not divisible by {}".format( - numerator, denominator) - - -def divide(numerator, denominator): - """Ensure that numerator is divisible by the denominator and return - the division value.""" - ensure_divisibility(numerator, denominator) - return numerator // denominator - - -def split_tensor_along_last_dim( - tensor: torch.Tensor, - num_partitions: int, - contiguous_split_chunks: bool = False, -) -> Sequence[torch.Tensor]: - """ Split a tensor along its last dimension. - - Arguments: - tensor: input tensor. - num_partitions: number of partitions to split the tensor - contiguous_split_chunks: If True, make each chunk contiguous - in memory. - - Returns: - A list of Tensors - """ - # Get the size and dimension. - last_dim = tensor.dim() - 1 - last_dim_size = divide(tensor.size()[last_dim], num_partitions) - # Split. - tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) - # NOTE: torch.split does not create contiguous tensors by default. - if contiguous_split_chunks: - return tuple(chunk.contiguous() for chunk in tensor_list) - - return tensor_list diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py deleted file mode 100644 index 2d41d40e046784632d70bd861ef6f3f4a53aa400..0000000000000000000000000000000000000000 --- a/vllm/model_executor/sampling_metadata.py +++ /dev/null @@ -1,236 +0,0 @@ -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple - -import torch - -from vllm.sampling_params import SamplingParams, SamplingType -from vllm.sequence import SequenceData -from vllm.utils import in_wsl - -_SAMPLING_EPS = 1e-5 - - -class SamplingMetadata: - """Metadata for input sequences. Used in sampler. - - Args: - seq_groups: List of (seq_ids, sampling_params). - seq_data: Seq_id -> SequenceData. - prompt_lens: Lengths of prompts. - selected_token_indices: Token indices selected for sampling. - categorized_sample_indices: SamplingType -> token indices to sample. - perform_sampling: Whether to perform sampling. This option is used to - make the sampling only happens in the driver worker, and disable - sampling in other worker processes. 
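A short usage sketch for `split_tensor_along_last_dim` from the `parallel_utils/utils.py` hunk above, written directly against `torch.split` so it runs standalone; the tensor shape and partition count are invented for illustration:

```python
import torch

# Hypothetical example: shard a [4, 12] weight across 3 tensor-parallel ranks.
weight = torch.arange(48, dtype=torch.float32).reshape(4, 12)
num_partitions = 3

# Equivalent to split_tensor_along_last_dim(weight, 3) in the deleted helper;
# divide() would first assert that 12 is divisible by 3.
chunk_size = weight.size(-1) // num_partitions
chunks = torch.split(weight, chunk_size, dim=-1)

assert len(chunks) == num_partitions
assert all(chunk.shape == (4, 4) for chunk in chunks)
# torch.split returns views; contiguous_split_chunks=True in the original
# helper calls .contiguous() on each chunk to get dense copies instead.
```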
- """ - - def __init__( - self, - seq_groups: Optional[List[Tuple[List[int], SamplingParams]]], - seq_data: Optional[Dict[int, SequenceData]], - prompt_lens: Optional[List[int]], - selected_token_indices: torch.Tensor, - categorized_sample_indices: Optional[Dict[SamplingType, torch.Tensor]], - perform_sampling: bool = True, - ) -> None: - self.seq_groups = seq_groups - self.seq_data = seq_data - self.prompt_lens = prompt_lens - self.selected_token_indices = selected_token_indices - self.categorized_sample_indices = categorized_sample_indices - self.perform_sampling = perform_sampling - - self.num_prompts = len(prompt_lens) if prompt_lens is not None else 0 - - def __repr__(self) -> str: - return ( - "SamplingMetadata(" - f"seq_groups={self.seq_groups}, " - f"seq_data={self.seq_data}, " - f"prompt_lens={self.prompt_lens}, " - f"selected_token_indices={self.selected_token_indices}, " - f"categorized_sample_indices={self.categorized_sample_indices}), " - f"perform_sampling={self.perform_sampling})") - - -@dataclass -class SamplingTensors: - """Tensors for sampling.""" - - temperatures: torch.Tensor - top_ps: torch.Tensor - top_ks: torch.Tensor - min_ps: torch.Tensor - presence_penalties: torch.Tensor - frequency_penalties: torch.Tensor - repetition_penalties: torch.Tensor - prompt_tokens: torch.Tensor - output_tokens: torch.Tensor - - @classmethod - def from_sampling_metadata( - cls, sampling_metadata: "SamplingMetadata", vocab_size: int, - device: torch.device, - dtype: torch.dtype) -> Tuple["SamplingTensors", bool, bool, bool]: - prompt_tokens: List[List[int]] = [] - output_tokens: List[List[int]] = [] - top_ks: List[int] = [] - temperatures: List[float] = [] - top_ps: List[float] = [] - min_ps: List[float] = [] - presence_penalties: List[float] = [] - frequency_penalties: List[float] = [] - repetition_penalties: List[float] = [] - do_penalties = False - do_top_p_top_k = False - do_min_p = False - for i, seq_group in enumerate(sampling_metadata.seq_groups): - seq_ids, sampling_params = seq_group - temperature = sampling_params.temperature - p = sampling_params.presence_penalty - f = sampling_params.frequency_penalty - r = sampling_params.repetition_penalty - top_p = sampling_params.top_p - min_p = sampling_params.min_p - # k should not be greater than the vocab size. - top_k = min(sampling_params.top_k, vocab_size) - top_k = vocab_size if top_k == -1 else top_k - if temperature < _SAMPLING_EPS: - # NOTE: Zero temperature means deterministic sampling - # (i.e., greedy sampling or beam search). - # Set the temperature to 1 to avoid division by zero. 
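A tiny check of the comment above: any positive temperature leaves the argmax unchanged, so substituting 1.0 for 0.0 only avoids the division by zero and does not alter greedy decoding. Purely illustrative:

```python
import torch

logits = torch.tensor([2.0, -1.0, 0.5])
greedy_token = torch.argmax(logits)

# Dividing by any positive temperature rescales the logits monotonically,
# so the greedy choice is identical.
assert torch.argmax(logits / 1.0) == greedy_token
assert torch.argmax(logits / 0.7) == greedy_token
```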
- temperature = 1.0 - if not do_top_p_top_k and (top_p < 1.0 - _SAMPLING_EPS - or top_k != vocab_size): - do_top_p_top_k = True - if not do_min_p and min_p > _SAMPLING_EPS: - do_min_p = True - if not do_penalties and (abs(p) >= _SAMPLING_EPS - or abs(f) >= _SAMPLING_EPS - or abs(r - 1.0) >= _SAMPLING_EPS): - do_penalties = True - if (i < sampling_metadata.num_prompts - and sampling_params.prompt_logprobs is not None): - # For tokens in the prompt that we only need to get their logprobs - prompt_len = sampling_metadata.prompt_lens[i] - temperatures += [temperature] * (prompt_len - 1) - top_ps += [top_p] * (prompt_len - 1) - top_ks += [top_k] * (prompt_len - 1) - min_ps += [min_p] * (prompt_len - 1) - presence_penalties += [0] * (prompt_len - 1) - frequency_penalties += [0] * (prompt_len - 1) - repetition_penalties += [1] * (prompt_len - 1) - prompt_tokens.extend([] for _ in range(prompt_len - 1)) - output_tokens.extend([] for _ in range(prompt_len - 1)) - for seq_id in seq_ids: - seq_data = sampling_metadata.seq_data[seq_id] - prompt_tokens.append(seq_data.prompt_token_ids) - output_tokens.append(seq_data.output_token_ids) - temperatures += [temperature] * len(seq_ids) - top_ps += [top_p] * len(seq_ids) - top_ks += [top_k] * len(seq_ids) - min_ps += [min_p] * len(seq_ids) - presence_penalties += [p] * len(seq_ids) - frequency_penalties += [f] * len(seq_ids) - repetition_penalties += [r] * len(seq_ids) - - sampling_tensors = SamplingTensors.from_lists( - temperatures, top_ps, top_ks, min_ps, presence_penalties, - frequency_penalties, repetition_penalties, prompt_tokens, - output_tokens, vocab_size, device, dtype) - return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p) - - @classmethod - def from_lists(cls, temperatures: List[float], top_ps: List[float], - top_ks: List[int], min_ps: List[float], - presence_penalties: List[float], - frequency_penalties: List[float], - repetition_penalties: List[float], - prompt_tokens: List[List[int]], - output_tokens: List[List[int]], vocab_size: int, - device: torch.device, - dtype: torch.dtype) -> "SamplingTensors": - # Note that the performance will be very bad without - # pinned memory. 
- pin_memory = not in_wsl() - prompt_max_len = max(len(tokens) for tokens in prompt_tokens) - prompt_padded_tokens = [ - tokens + [vocab_size] * (prompt_max_len - len(tokens)) - for tokens in prompt_tokens - ] - output_max_len = max(len(tokens) for tokens in output_tokens) - output_padded_tokens = [ - tokens + [vocab_size] * (output_max_len - len(tokens)) - for tokens in output_tokens - ] - - temperatures_t = torch.tensor( - temperatures, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - top_ps_t = torch.tensor( - top_ps, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - min_ps_t = torch.tensor( - min_ps, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - presence_penalties_t = torch.tensor( - presence_penalties, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - frequency_penalties_t = torch.tensor( - frequency_penalties, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - repetition_penalties_t = torch.tensor( - repetition_penalties, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - top_ks_t = torch.tensor( - top_ks, - device="cpu", - dtype=torch.int, - pin_memory=pin_memory, - ) - prompt_tensor = torch.tensor( - prompt_padded_tokens, - device="cpu", - dtype=torch.long, - pin_memory=pin_memory, - ) - output_tensor = torch.tensor( - output_padded_tokens, - device="cpu", - dtype=torch.long, - pin_memory=pin_memory, - ) - # Because the memory is pinned, we can do non-blocking - # transfer to device. - return cls( - temperatures=temperatures_t.to(device=device, non_blocking=True), - top_ps=top_ps_t.to(device=device, non_blocking=True), - top_ks=top_ks_t.to(device=device, non_blocking=True), - min_ps=min_ps_t.to(device=device, non_blocking=True), - presence_penalties=presence_penalties_t.to(device=device, - non_blocking=True), - frequency_penalties=frequency_penalties_t.to(device=device, - non_blocking=True), - repetition_penalties=repetition_penalties_t.to(device=device, - non_blocking=True), - prompt_tokens=prompt_tensor.to(device=device, non_blocking=True), - output_tokens=output_tensor.to(device=device, non_blocking=True), - ) diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py deleted file mode 100644 index 336bc1cd005cf3f4bf8eb727bfb782ac6cc1f109..0000000000000000000000000000000000000000 --- a/vllm/model_executor/utils.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Utils for model executor.""" -import random -from typing import Any, Dict, Optional - -import numpy as np -import torch - - -def set_random_seed(seed: int) -> None: - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(seed) - - -def set_weight_attrs( - weight: torch.Tensor, - weight_attrs: Optional[Dict[str, Any]], -): - """Set attributes on a weight tensor. - - This method is used to set attributes on a weight tensor. This method - will not overwrite existing attributes. - - Args: - weight: The weight tensor. - weight_attrs: A dictionary of attributes to set on the weight tensor. 
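The closing comment of `from_lists` above ("Because the memory is pinned, we can do non-blocking transfer to device") corresponds to the following pattern; a minimal sketch that works whether or not CUDA is present:

```python
import torch

values = [0.8, 1.0, 0.7]

# Page-locked (pinned) host memory is what makes a truly asynchronous
# host-to-device copy possible.
host = torch.tensor(values,
                    device="cpu",
                    dtype=torch.float32,
                    pin_memory=torch.cuda.is_available())

if torch.cuda.is_available():
    # non_blocking=True lets the copy overlap with other work on the stream.
    device_tensor = host.to(device="cuda", non_blocking=True)
else:
    device_tensor = host  # CPU-only fallback for this sketch
```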
- """ - if weight_attrs is None: - return - for key, value in weight_attrs.items(): - assert not hasattr( - weight, key), (f"Overwriting existing tensor attribute: {key}") - setattr(weight, key, value) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py deleted file mode 100644 index 8e6f7a174f219e282d77ecc6f7c49dad162d58c6..0000000000000000000000000000000000000000 --- a/vllm/model_executor/weight_utils.py +++ /dev/null @@ -1,301 +0,0 @@ -"""Utilities for downloading and initializing model weights.""" -import filelock -import glob -import fnmatch -import json -import os -from collections import defaultdict -from typing import Any, Iterator, List, Optional, Tuple - -from huggingface_hub import snapshot_download, HfFileSystem -import numpy as np -from safetensors.torch import load_file, save_file, safe_open -import torch -from transformers import PretrainedConfig -from tqdm.auto import tqdm - -from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import (get_quantization_config, - QuantizationConfig) - -logger = init_logger(__name__) - - -class Disabledtqdm(tqdm): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs, disable=True) - - -def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): - lock_dir = cache_dir if cache_dir is not None else "/tmp" - lock_file_name = model_name_or_path.replace("/", "-") + ".lock" - lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name)) - return lock - - -def _shared_pointers(tensors): - ptrs = defaultdict(list) - for k, v in tensors.items(): - ptrs[v.data_ptr()].append(k) - failing = [] - for _, names in ptrs.items(): - if len(names) > 1: - failing.append(names) - return failing - - -def convert_bin_to_safetensor_file( - pt_filename: str, - sf_filename: str, -) -> None: - loaded = torch.load(pt_filename, map_location="cpu") - if "state_dict" in loaded: - loaded = loaded["state_dict"] - shared = _shared_pointers(loaded) - for shared_weights in shared: - for name in shared_weights[1:]: - loaded.pop(name) - - # For tensors to be contiguous - loaded = {k: v.contiguous() for k, v in loaded.items()} - - dirname = os.path.dirname(sf_filename) - os.makedirs(dirname, exist_ok=True) - save_file(loaded, sf_filename, metadata={"format": "pt"}) - - # check file size - sf_size = os.stat(sf_filename).st_size - pt_size = os.stat(pt_filename).st_size - if (sf_size - pt_size) / pt_size > 0.01: - raise RuntimeError(f"""The file size different is more than 1%: - - {sf_filename}: {sf_size} - - {pt_filename}: {pt_size} - """) - - # check if the tensors are the same - reloaded = load_file(sf_filename) - for k in loaded: - pt_tensor = loaded[k] - sf_tensor = reloaded[k] - if not torch.equal(pt_tensor, sf_tensor): - raise RuntimeError(f"The output tensors do not match for key {k}") - - -# TODO(woosuk): Move this to other place. -def get_quant_config( - quantization: str, - model_name_or_path: str, - hf_config: PretrainedConfig, - cache_dir: Optional[str] = None, -) -> QuantizationConfig: - quant_cls = get_quantization_config(quantization) - # Read the quantization config from the HF model config, if available. - hf_quant_config = getattr(hf_config, "quantization_config", None) - if hf_quant_config is not None: - return quant_cls.from_config(hf_quant_config) - - is_local = os.path.isdir(model_name_or_path) - if not is_local: - # Download the config files. 
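The `get_lock` helper above wraps `filelock.FileLock` so that only one process downloads or converts a given model at a time. The guarded pattern it enables looks like this; the model name and lock directory are placeholders:

```python
import os

import filelock

model_name = "some-org/some-model"  # placeholder
lock_dir = "/tmp"                   # same default the helper falls back to
lock_path = os.path.join(lock_dir, model_name.replace("/", "-") + ".lock")

# Only one process enters the critical section; the others block until it
# finishes, so a shared cache directory is never written to concurrently.
with filelock.FileLock(lock_path):
    pass  # download or convert files here
```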
- with get_lock(model_name_or_path, cache_dir): - hf_folder = snapshot_download(model_name_or_path, - allow_patterns="*.json", - cache_dir=cache_dir, - tqdm_class=Disabledtqdm) - else: - hf_folder = model_name_or_path - config_files = glob.glob(os.path.join(hf_folder, "*.json")) - - quant_config_files = [ - f for f in config_files if any( - f.endswith(x) for x in quant_cls.get_config_filenames()) - ] - if len(quant_config_files) == 0: - raise ValueError(f"Cannot find the config file for {quantization}") - if len(quant_config_files) > 1: - raise ValueError(f"Found multiple config files for {quantization}: " - f"{quant_config_files}") - - quant_config_file = quant_config_files[0] - with open(quant_config_file, "r") as f: - config = json.load(f) - return quant_cls.from_config(config) - - -def prepare_hf_model_weights( - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - fall_back_to_pt: bool = True, - revision: Optional[str] = None, -) -> Tuple[str, List[str], bool]: - # Download model weights from huggingface. - is_local = os.path.isdir(model_name_or_path) - use_safetensors = False - # Some quantized models use .pt files for storing the weights. - if load_format == "auto": - allow_patterns = ["*.safetensors", "*.bin"] - elif load_format == "safetensors": - use_safetensors = True - allow_patterns = ["*.safetensors"] - elif load_format == "pt": - allow_patterns = ["*.pt"] - elif load_format == "npcache": - allow_patterns = ["*.bin"] - else: - raise ValueError(f"Unknown load_format: {load_format}") - - if fall_back_to_pt: - allow_patterns += ["*.pt"] - - if not is_local: - # Before we download we look at that is available: - fs = HfFileSystem() - file_list = fs.ls(model_name_or_path, detail=False, revision=revision) - - # depending on what is available we download different things - for pattern in allow_patterns: - matching = fnmatch.filter(file_list, pattern) - if len(matching) > 0: - allow_patterns = [pattern] - break - - logger.info(f"Using model weights format {allow_patterns}") - # Use file lock to prevent multiple processes from - # downloading the same model weights at the same time. - with get_lock(model_name_or_path, cache_dir): - hf_folder = snapshot_download(model_name_or_path, - allow_patterns=allow_patterns, - cache_dir=cache_dir, - tqdm_class=Disabledtqdm, - revision=revision) - else: - hf_folder = model_name_or_path - hf_weights_files: List[str] = [] - for pattern in allow_patterns: - hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) - if len(hf_weights_files) > 0: - if pattern == "*.safetensors": - use_safetensors = True - break - if not use_safetensors: - # Exclude files that are not needed for inference. 
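The format-selection step in `prepare_hf_model_weights` above (keep only the first `allow_patterns` entry that matches anything in the repository listing) is easy to see in isolation; the file list below is invented for the example:

```python
import fnmatch

# Hypothetical repository listing.
file_list = [
    "model-00001-of-00002.safetensors",
    "model-00002-of-00002.safetensors",
    "pytorch_model.bin",
    "config.json",
]

allow_patterns = ["*.safetensors", "*.bin", "*.pt"]
for pattern in allow_patterns:
    if fnmatch.filter(file_list, pattern):
        # Prefer the first format that is actually present.
        allow_patterns = [pattern]
        break

assert allow_patterns == ["*.safetensors"]
```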
- # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233 - blacklist = [ - "training_args.bin", - "optimizer.bin", - "optimizer.pt", - "scheduler.pt", - "scaler.pt", - ] - hf_weights_files = [ - f for f in hf_weights_files - if not any(f.endswith(x) for x in blacklist) - ] - - if len(hf_weights_files) == 0: - raise RuntimeError( - f"Cannot find any model weights with `{model_name_or_path}`") - - return hf_folder, hf_weights_files, use_safetensors - - -def hf_model_weights_iterator( - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None, - fall_back_to_pt: Optional[bool] = True, -) -> Iterator[Tuple[str, torch.Tensor]]: - hf_folder, hf_weights_files, use_safetensors = prepare_hf_model_weights( - model_name_or_path, - cache_dir=cache_dir, - load_format=load_format, - fall_back_to_pt=fall_back_to_pt, - revision=revision) - - if load_format == "npcache": - # Currently np_cache only support *.bin checkpoints - assert use_safetensors is False - - # Convert the model weights from torch tensors to numpy arrays for - # faster loading. - np_folder = os.path.join(hf_folder, "np") - os.makedirs(np_folder, exist_ok=True) - weight_names_file = os.path.join(np_folder, "weight_names.json") - # Use file lock to prevent multiple processes from - # dumping the same model weights to numpy at the same time. - with get_lock(model_name_or_path, cache_dir): - if not os.path.exists(weight_names_file): - weight_names = [] - for bin_file in hf_weights_files: - state = torch.load(bin_file, map_location="cpu") - for name, param in state.items(): - param_path = os.path.join(np_folder, name) - with open(param_path, "wb") as f: - np.save(f, param.cpu().detach().numpy()) - weight_names.append(name) - with open(weight_names_file, "w") as f: - json.dump(weight_names, f) - - with open(weight_names_file, "r") as f: - weight_names = json.load(f) - - for name in weight_names: - param_path = os.path.join(np_folder, name) - with open(param_path, "rb") as f: - param = np.load(f) - yield name, torch.from_numpy(param) - elif use_safetensors: - for st_file in hf_weights_files: - with safe_open(st_file, framework="pt") as f: - for name in f.keys(): # noqa: SIM118 - param = f.get_tensor(name) - yield name, param - else: - for bin_file in hf_weights_files: - state = torch.load(bin_file, map_location="cpu") - for name, param in state.items(): - yield name, param - del state - torch.cuda.empty_cache() - - -def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: - """convert PySafeSlice object from safetensors to torch.Tensor - - PySafeSlice object supports indexing, which is done before loading the - actual tensor and can reduce the amount of memory being read into the - memory. However, it does not support more advanced functionalities - like `.view()` or `.t()`. Therefore, if we need to modify the loaded - tensor with these more complicated operators, we need to convert to - tensor first. - """ - if not isinstance(x, torch.Tensor): - x = x[:] - return x - - -def default_weight_loader(param: torch.Tensor, - loaded_weight: torch.Tensor) -> None: - """Default weight loader.""" - assert param.size() == loaded_weight.size() - param.data.copy_(loaded_weight) - - -def initialize_dummy_weights( - model: torch.nn.Module, - low: float = -1e-3, - high: float = 1e-3, -) -> None: - """Initialize model weights with random values. - - The model weights must be randomly initialized for accurate performance - measurements. 
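To connect `hf_model_weights_iterator` with `default_weight_loader` above, here is a hedged sketch of how a model's parameters can be filled from a stream of `(name, tensor)` pairs; the toy module and in-memory iterator are stand-ins, not the real loading path:

```python
import torch
import torch.nn as nn

model = nn.Linear(4, 2, bias=False)  # toy stand-in for a real model


def toy_weights_iterator():
    # Stand-in for hf_model_weights_iterator: yields (name, tensor) pairs.
    yield "weight", torch.zeros(2, 4)


params = dict(model.named_parameters())
for name, loaded_weight in toy_weights_iterator():
    param = params[name]
    # Same contract as default_weight_loader: shapes must match exactly.
    assert param.size() == loaded_weight.size()
    param.data.copy_(loaded_weight)
```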
Additionally, the model weights should not cause NaNs in the - forward pass. We empirically found that initializing the weights with - values between -1e-3 and 1e-3 works well for most models. - """ - for param in model.state_dict().values(): - if torch.is_floating_point(param): - param.data.uniform_(low, high) diff --git a/vllm/outputs.py b/vllm/outputs.py deleted file mode 100644 index 534e9d5ea8a53f7af33c9560704e61b491c68872..0000000000000000000000000000000000000000 --- a/vllm/outputs.py +++ /dev/null @@ -1,133 +0,0 @@ -from typing import List, Optional - -from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup, - SequenceStatus) -from vllm.lora.request import LoRARequest - - -class CompletionOutput: - """The output data of one completion output of a request. - - Args: - index: The index of the output in the request. - text: The generated output text. - token_ids: The token IDs of the generated output text. - cumulative_logprob: The cumulative log probability of the generated - output text. - logprobs: The log probabilities of the top probability words at each - position if the logprobs are requested. - finish_reason: The reason why the sequence is finished. - lora_request: The LoRA request that was used to generate the output. - """ - - def __init__( - self, - index: int, - text: str, - token_ids: List[int], - cumulative_logprob: float, - logprobs: Optional[SampleLogprobs], - finish_reason: Optional[str] = None, - lora_request: Optional[LoRARequest] = None, - ) -> None: - self.index = index - self.text = text - self.token_ids = token_ids - self.cumulative_logprob = cumulative_logprob - self.logprobs = logprobs - self.finish_reason = finish_reason - self.lora_request = lora_request - - def finished(self) -> bool: - return self.finish_reason is not None - - def __repr__(self) -> str: - return (f"CompletionOutput(index={self.index}, " - f"text={self.text!r}, " - f"token_ids={self.token_ids}, " - f"cumulative_logprob={self.cumulative_logprob}, " - f"logprobs={self.logprobs}, " - f"finish_reason={self.finish_reason})") - - -class RequestOutput: - """The output data of a request to the LLM. - - Args: - request_id: The unique ID of the request. - prompt: The prompt string of the request. - prompt_token_ids: The token IDs of the prompt. - prompt_logprobs: The log probabilities to return per prompt token. - outputs: The output sequences of the request. - finished: Whether the whole request is finished. - lora_request: The LoRA request that was used to generate the output. - """ - - def __init__( - self, - request_id: str, - prompt: str, - prompt_token_ids: List[int], - prompt_logprobs: Optional[PromptLogprobs], - outputs: List[CompletionOutput], - finished: bool, - lora_request: Optional[LoRARequest] = None, - ) -> None: - self.request_id = request_id - self.prompt = prompt - self.prompt_token_ids = prompt_token_ids - self.prompt_logprobs = prompt_logprobs - self.outputs = outputs - self.finished = finished - self.lora_request = lora_request - - @classmethod - def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": - # Get the top-n sequences. - n = seq_group.sampling_params.n - seqs = seq_group.get_seqs() - if seq_group.sampling_params.use_beam_search: - sorting_key = lambda seq: seq.get_beam_search_score( - seq_group.sampling_params.length_penalty) - else: - sorting_key = lambda seq: seq.get_cumulative_logprob() - sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) - top_n_seqs = sorted_seqs[:n] - - # Create the outputs. 
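The top-n selection in `from_seq_group` above is just a sort with the appropriate key; a stripped-down illustration using plain tuples in place of `Sequence` objects:

```python
# (seq_id, cumulative_logprob) pairs standing in for Sequence objects.
seqs = [(0, -1.2), (1, -0.3), (2, -2.5)]
n = 2

# Non-beam-search case: rank candidates by cumulative log probability.
sorted_seqs = sorted(seqs, key=lambda s: s[1], reverse=True)
top_n = sorted_seqs[:n]

assert [seq_id for seq_id, _ in top_n] == [1, 0]
```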
- outputs: List[CompletionOutput] = [] - for seq in top_n_seqs: - logprobs = seq.output_logprobs - if seq_group.sampling_params.logprobs is None: - # NOTE: We need to take care of this case because the sequence - # always has the logprobs of the sampled tokens even if the - # logprobs are not requested. - logprobs = None - finshed_reason = SequenceStatus.get_finished_reason(seq.status) - output = CompletionOutput(seqs.index(seq), seq.output_text, - seq.get_output_token_ids(), - seq.get_cumulative_logprob(), logprobs, - finshed_reason) - outputs.append(output) - - # Every sequence in the sequence group should have the same prompt. - prompt = seq_group.prompt - prompt_token_ids = seq_group.prompt_token_ids - prompt_logprobs = seq_group.prompt_logprobs - finished = seq_group.is_finished() - return cls(seq_group.request_id, - prompt, - prompt_token_ids, - prompt_logprobs, - outputs, - finished, - lora_request=seq_group.lora_request) - - def __repr__(self) -> str: - return (f"RequestOutput(request_id={self.request_id}, " - f"prompt={self.prompt!r}, " - f"prompt_token_ids={self.prompt_token_ids}, " - f"prompt_logprobs={self.prompt_logprobs}, " - f"outputs={self.outputs}, " - f"finished={self.finished}, " - f"lora_request={self.lora_request})") diff --git a/vllm/prefix.py b/vllm/prefix.py deleted file mode 100644 index 5b6e8e4b92be6a6418741fe10bfc8fd3a52bd7c8..0000000000000000000000000000000000000000 --- a/vllm/prefix.py +++ /dev/null @@ -1,87 +0,0 @@ -from typing import Dict, List, Sequence, Tuple, Optional - -from vllm.block import BlockTable - - -class Prefix: - """Data and states associated with a prefix of prompt tokens for multiple - sequence groups. - - NOTE: This feature is experimental and may be replaced with automatic - prefix caching in the future. - - Args: - token_ids: The token ids of the prefix. - block_size: The block size of the executed model. - """ - - def __init__( - self, - token_ids: Sequence[int], - block_size: int, - ) -> None: - self.token_ids = tuple(token_ids) - self.block_size = block_size - self.length = len(token_ids) - self.hash = hash(token_ids) - assert self.length % block_size == 0 - self.block_table: Optional[BlockTable] = None - self.computed = False - - @property - def allocated(self) -> bool: - return self.block_table is not None - - def get_num_blocks(self) -> int: - return self.length // self.block_size - - def get_block_numbers(self) -> List[int]: - return [block.block_number for block in self.block_table] - - def get_length(self) -> int: - return self.length - - def __hash__(self) -> int: - return self.hash - - def set_block_table(self, block_table: BlockTable) -> None: - self.block_table = block_table.copy() - - -class PrefixPool: - """Manages all the prompt prefixes. - - NOTE: This feature is experimental and may be replaced with automatic - prefix caching in the future. - - Args: - block_size: The block size of the executed model. - - Attributes: - prefixes: A list of all the prefixes. - block_size: The block size of the executed model. - """ - - def __init__( - self, - block_size: int, - ) -> None: - # TODO(zhuohan): Add a capacity limit to the prefix pool. 
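A small numeric illustration of the invariant the `Prefix` class above enforces: a cached prefix must span a whole number of blocks, which is why the pool truncates token ids to a block multiple (see `_truncate_token_ids` in the next hunk) before hashing them. Block size and token count below are arbitrary:

```python
block_size = 16
token_ids = list(range(40))  # 40 prompt tokens

# Truncate to a multiple of the block size, as the prefix pool does.
usable_len = len(token_ids) // block_size * block_size
prefix_tokens = tuple(token_ids[:usable_len])  # 32 tokens -> 2 full blocks

assert usable_len % block_size == 0            # Prefix's constructor invariant
assert usable_len // block_size == 2           # Prefix.get_num_blocks()

# Hashing the truncated token tuple lets identical prompt prefixes share
# the same cached blocks.
prefix_hash = hash(prefix_tokens)
```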
- self.prefixes: Dict[int, Prefix] = {} - self.block_size = block_size - - def _truncate_token_ids(self, token_ids: Sequence[int]) -> Tuple[int]: - new_length = len(token_ids) // self.block_size * self.block_size - return tuple(token_ids[:new_length]) - - def add_or_get_prefix(self, token_ids: Sequence[int], - lora_int_id: int) -> Optional[Prefix]: - token_ids = self._truncate_token_ids(token_ids) - if len(token_ids) == 0: - # Prefix is empty. - return None - prefix = Prefix(token_ids, self.block_size) - prefix_hash = hash((prefix, lora_int_id)) - if prefix_hash not in self.prefixes: - self.prefixes[prefix_hash] = prefix - return self.prefixes[prefix_hash] diff --git a/vllm/py.typed b/vllm/py.typed deleted file mode 100644 index 33b3ad73cac6fcd1624ea060d44334b79da3ccd0..0000000000000000000000000000000000000000 --- a/vllm/py.typed +++ /dev/null @@ -1,2 +0,0 @@ -# Marker file for PEP 561. -# The vllm package uses inline types. diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py deleted file mode 100644 index bb7d0002c910ca5144d630d57864cfe4c2b3e03e..0000000000000000000000000000000000000000 --- a/vllm/sampling_params.py +++ /dev/null @@ -1,257 +0,0 @@ -"""Sampling parameters for text generation.""" -from enum import IntEnum -from functools import cached_property -from typing import Callable, List, Optional, Union - -import torch - -_SAMPLING_EPS = 1e-5 - - -class SamplingType(IntEnum): - GREEDY = 0 - RANDOM = 1 - BEAM = 2 - - -LogitsProcessor = Callable[[List[int], torch.Tensor], torch.Tensor] -"""LogitsProcessor is a function that takes a list of previously generated -tokens and a tensor of the logits for the next token, and returns a modified -tensor of logits to sample from.""" - - -class SamplingParams: - """Sampling parameters for text generation. - - Overall, we follow the sampling parameters from the OpenAI text completion - API (https://platform.openai.com/docs/api-reference/completions/create). - In addition, we support beam search, which is not supported by OpenAI. - - Args: - n: Number of output sequences to return for the given prompt. - best_of: Number of output sequences that are generated from the prompt. - From these `best_of` sequences, the top `n` sequences are returned. - `best_of` must be greater than or equal to `n`. This is treated as - the beam width when `use_beam_search` is True. By default, `best_of` - is set to `n`. - presence_penalty: Float that penalizes new tokens based on whether they - appear in the generated text so far. Values > 0 encourage the model - to use new tokens, while values < 0 encourage the model to repeat - tokens. - frequency_penalty: Float that penalizes new tokens based on their - frequency in the generated text so far. Values > 0 encourage the - model to use new tokens, while values < 0 encourage the model to - repeat tokens. - repetition_penalty: Float that penalizes new tokens based on whether - they appear in the prompt and the generated text so far. Values > 1 - encourage the model to use new tokens, while values < 1 encourage - the model to repeat tokens. - temperature: Float that controls the randomness of the sampling. Lower - values make the model more deterministic, while higher values make - the model more random. Zero means greedy sampling. - top_p: Float that controls the cumulative probability of the top tokens - to consider. Must be in (0, 1]. Set to 1 to consider all tokens. - top_k: Integer that controls the number of top tokens to consider. Set - to -1 to consider all tokens. 
-        min_p: Float that represents the minimum probability for a token to be
-            considered, relative to the probability of the most likely token.
-            Must be in [0, 1]. Set to 0 to disable this.
-        use_beam_search: Whether to use beam search instead of sampling.
-        length_penalty: Float that penalizes sequences based on their length.
-            Used in beam search.
-        early_stopping: Controls the stopping condition for beam search. It
-            accepts the following values: `True`, where the generation stops as
-            soon as there are `best_of` complete candidates; `False`, where a
-            heuristic is applied and the generation stops when it is very
-            unlikely to find better candidates; `"never"`, where the beam search
-            procedure only stops when there cannot be better candidates
-            (canonical beam search algorithm).
-        stop: List of strings that stop the generation when they are generated.
-            The returned output will not contain the stop strings.
-        stop_token_ids: List of tokens that stop the generation when they are
-            generated. The returned output will contain the stop tokens unless
-            the stop tokens are special tokens.
-        include_stop_str_in_output: Whether to include the stop strings in output
-            text. Defaults to False.
-        ignore_eos: Whether to ignore the EOS token and continue generating
-            tokens after the EOS token is generated.
-        max_tokens: Maximum number of tokens to generate per output sequence.
-        logprobs: Number of log probabilities to return per output token.
-            Note that the implementation follows the OpenAI API: The return
-            result includes the log probabilities on the `logprobs` most likely
-            tokens, as well as the chosen tokens. The API will always return the
-            log probability of the sampled token, so there may be up to
-            `logprobs+1` elements in the response.
-        prompt_logprobs: Number of log probabilities to return per prompt token.
-        skip_special_tokens: Whether to skip special tokens in the output.
-        spaces_between_special_tokens: Whether to add spaces between special
-            tokens in the output. Defaults to True.
-        logits_processors: List of functions that modify logits based on
-            previously generated tokens.
- """ - - def __init__( - self, - n: int = 1, - best_of: Optional[int] = None, - presence_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repetition_penalty: float = 1.0, - temperature: float = 1.0, - top_p: float = 1.0, - top_k: int = -1, - min_p: float = 0.0, - use_beam_search: bool = False, - length_penalty: float = 1.0, - early_stopping: Union[bool, str] = False, - stop: Optional[Union[str, List[str]]] = None, - stop_token_ids: Optional[List[int]] = None, - include_stop_str_in_output: bool = False, - ignore_eos: bool = False, - max_tokens: Optional[int] = 16, - logprobs: Optional[int] = None, - prompt_logprobs: Optional[int] = None, - skip_special_tokens: bool = True, - spaces_between_special_tokens: bool = True, - logits_processors: Optional[List[LogitsProcessor]] = None, - ) -> None: - self.n = n - self.best_of = best_of if best_of is not None else n - self.presence_penalty = presence_penalty - self.frequency_penalty = frequency_penalty - self.repetition_penalty = repetition_penalty - self.temperature = temperature - self.top_p = top_p - self.top_k = top_k - self.min_p = min_p - self.use_beam_search = use_beam_search - self.length_penalty = length_penalty - self.early_stopping = early_stopping - if stop is None: - self.stop = [] - elif isinstance(stop, str): - self.stop = [stop] - else: - self.stop = list(stop) - if stop_token_ids is None: - self.stop_token_ids = [] - else: - self.stop_token_ids = list(stop_token_ids) - self.ignore_eos = ignore_eos - self.max_tokens = max_tokens - self.logprobs = logprobs - self.prompt_logprobs = prompt_logprobs - self.skip_special_tokens = skip_special_tokens - self.spaces_between_special_tokens = spaces_between_special_tokens - self.logits_processors = logits_processors - self.include_stop_str_in_output = include_stop_str_in_output - self._verify_args() - if self.use_beam_search: - self._verify_beam_search() - else: - self._verify_non_beam_search() - if self.temperature < _SAMPLING_EPS: - # Zero temperature means greedy sampling. 
- self.top_p = 1.0 - self.top_k = -1 - self.min_p = 0.0 - self._verify_greedy_sampling() - - def _verify_args(self) -> None: - if self.n < 1: - raise ValueError(f"n must be at least 1, got {self.n}.") - if self.best_of < self.n: - raise ValueError(f"best_of must be greater than or equal to n, " - f"got n={self.n} and best_of={self.best_of}.") - if not -2.0 <= self.presence_penalty <= 2.0: - raise ValueError("presence_penalty must be in [-2, 2], got " - f"{self.presence_penalty}.") - if not -2.0 <= self.frequency_penalty <= 2.0: - raise ValueError("frequency_penalty must be in [-2, 2], got " - f"{self.frequency_penalty}.") - if not 0.0 < self.repetition_penalty <= 2.0: - raise ValueError("repetition_penalty must be in (0, 2], got " - f"{self.repetition_penalty}.") - if self.temperature < 0.0: - raise ValueError( - f"temperature must be non-negative, got {self.temperature}.") - if not 0.0 < self.top_p <= 1.0: - raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.") - if self.top_k < -1 or self.top_k == 0: - raise ValueError(f"top_k must be -1 (disable), or at least 1, " - f"got {self.top_k}.") - if not 0.0 <= self.min_p <= 1.0: - raise ValueError("min_p must be in [0, 1], got " - f"{self.min_p}.") - if self.max_tokens is not None and self.max_tokens < 1: - raise ValueError( - f"max_tokens must be at least 1, got {self.max_tokens}.") - if self.logprobs is not None and self.logprobs < 0: - raise ValueError( - f"logprobs must be non-negative, got {self.logprobs}.") - if self.prompt_logprobs is not None and self.prompt_logprobs < 0: - raise ValueError(f"prompt_logprobs must be non-negative, got " - f"{self.prompt_logprobs}.") - - def _verify_beam_search(self) -> None: - if self.best_of == 1: - raise ValueError("best_of must be greater than 1 when using beam " - f"search. Got {self.best_of}.") - if self.temperature > _SAMPLING_EPS: - raise ValueError("temperature must be 0 when using beam search.") - if self.top_p < 1.0 - _SAMPLING_EPS: - raise ValueError("top_p must be 1 when using beam search.") - if self.top_k != -1: - raise ValueError("top_k must be -1 when using beam search.") - if self.early_stopping not in [True, False, "never"]: - raise ValueError( - f"early_stopping must be True, False, or 'never', " - f"got {self.early_stopping}.") - - def _verify_non_beam_search(self) -> None: - if self.early_stopping is not False: - raise ValueError("early_stopping is not effective and must be " - "False when not using beam search.") - if (self.length_penalty < 1.0 - _SAMPLING_EPS - or self.length_penalty > 1.0 + _SAMPLING_EPS): - raise ValueError( - "length_penalty is not effective and must be the " - "default value of 1.0 when not using beam search.") - - def _verify_greedy_sampling(self) -> None: - if self.best_of > 1: - raise ValueError("best_of must be 1 when using greedy sampling." 
- f"Got {self.best_of}.") - - @cached_property - def sampling_type(self) -> SamplingType: - if self.use_beam_search: - return SamplingType.BEAM - if self.temperature < _SAMPLING_EPS: - return SamplingType.GREEDY - return SamplingType.RANDOM - - def __repr__(self) -> str: - return ( - f"SamplingParams(n={self.n}, " - f"best_of={self.best_of}, " - f"presence_penalty={self.presence_penalty}, " - f"frequency_penalty={self.frequency_penalty}, " - f"repetition_penalty={self.repetition_penalty}, " - f"temperature={self.temperature}, " - f"top_p={self.top_p}, " - f"top_k={self.top_k}, " - f"min_p={self.min_p}, " - f"use_beam_search={self.use_beam_search}, " - f"length_penalty={self.length_penalty}, " - f"early_stopping={self.early_stopping}, " - f"stop={self.stop}, " - f"stop_token_ids={self.stop_token_ids}, " - f"include_stop_str_in_output={self.include_stop_str_in_output}, " - f"ignore_eos={self.ignore_eos}, " - f"max_tokens={self.max_tokens}, " - f"logprobs={self.logprobs}, " - f"prompt_logprobs={self.prompt_logprobs}, " - f"skip_special_tokens={self.skip_special_tokens}, " - "spaces_between_special_tokens=" - f"{self.spaces_between_special_tokens})") diff --git a/vllm/sequence.py b/vllm/sequence.py deleted file mode 100644 index d28627f47498ff05bf2288ecf944b5d20c143a4e..0000000000000000000000000000000000000000 --- a/vllm/sequence.py +++ /dev/null @@ -1,441 +0,0 @@ -"""Sequence and its related classes.""" -import copy -import enum -from typing import Dict, List, Optional, Union - -from vllm.block import LogicalTokenBlock -from vllm.prefix import Prefix -from vllm.sampling_params import SamplingParams -from vllm.lora.request import LoRARequest - -PromptLogprobs = List[Optional[Dict[int, float]]] -SampleLogprobs = List[Dict[int, float]] - - -class SequenceStatus(enum.Enum): - """Status of a sequence.""" - WAITING = enum.auto() - RUNNING = enum.auto() - SWAPPED = enum.auto() - FINISHED_STOPPED = enum.auto() - FINISHED_LENGTH_CAPPED = enum.auto() - FINISHED_ABORTED = enum.auto() - FINISHED_IGNORED = enum.auto() - - @staticmethod - def is_finished(status: "SequenceStatus") -> bool: - return status in [ - SequenceStatus.FINISHED_STOPPED, - SequenceStatus.FINISHED_LENGTH_CAPPED, - SequenceStatus.FINISHED_ABORTED, - SequenceStatus.FINISHED_IGNORED, - ] - - @staticmethod - def get_finished_reason(status: "SequenceStatus") -> Union[str, None]: - if status == SequenceStatus.FINISHED_STOPPED: - finish_reason = "stop" - elif status == SequenceStatus.FINISHED_LENGTH_CAPPED: - finish_reason = "length" - elif status == SequenceStatus.FINISHED_ABORTED: - finish_reason = "abort" - elif status == SequenceStatus.FINISHED_IGNORED: - # The ignored sequences are the sequences whose prompt lengths - # are longer than the model's length cap. Therefore, the stop - # reason should also be "length" as in OpenAI API. - finish_reason = "length" - else: - finish_reason = None - return finish_reason - - -class SequenceData: - """Data associated with a sequence. - - - Args: - prompt_token_ids: The token IDs of the prompt. - - Attributes: - prompt_token_ids: The token IDs of the prompt. - output_token_ids: The token IDs of the output. - cumulative_logprob: The cumulative log probability of the output. 
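Stepping back to the `SamplingParams` class completed above, a few hedged construction examples showing which argument combinations its `_verify_*` checks accept; the values are illustrative and assume the `vllm` package is importable:

```python
from vllm.sampling_params import SamplingParams

# Greedy decoding: temperature=0 internally forces top_p=1.0, top_k=-1 and
# min_p=0.0, and requires best_of == 1 (the default).
greedy = SamplingParams(temperature=0.0, max_tokens=64)
assert greedy.sampling_type.name == "GREEDY"

# Standard nucleus sampling.
nucleus = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=128)

# Beam search: temperature must be 0, top_p 1.0, top_k -1, and best_of > 1.
beam = SamplingParams(use_beam_search=True,
                      best_of=4,
                      temperature=0.0,
                      max_tokens=128)
```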
- """ - - def __init__( - self, - prompt_token_ids: List[int], - ) -> None: - self.prompt_token_ids = prompt_token_ids - self.output_token_ids: List[int] = [] - self.cumulative_logprob = 0.0 - - def append_token_id(self, token_id: int, logprob: float) -> None: - self.output_token_ids.append(token_id) - self.cumulative_logprob += logprob - - def get_len(self) -> int: - return len(self.output_token_ids) + len(self.prompt_token_ids) - - def get_prompt_len(self) -> int: - return len(self.prompt_token_ids) - - def get_output_len(self) -> int: - return len(self.output_token_ids) - - def get_token_ids(self) -> List[int]: - return self.prompt_token_ids + self.output_token_ids - - def get_last_token_id(self) -> int: - if not self.output_token_ids: - return self.prompt_token_ids[-1] - return self.output_token_ids[-1] - - def __repr__(self) -> str: - return (f"SequenceData(" - f"prompt_token_ids={self.prompt_token_ids}, " - f"output_token_ids={self.output_token_ids}, " - f"cumulative_logprob={self.cumulative_logprob})") - - -class Sequence: - """Stores the data, status, and block information of a sequence. - - Args: - seq_id: The ID of the sequence. - prompt: The prompt of the sequence. - prompt_token_ids: The token IDs of the prompt. - block_size: The block size of the sequence. Should be the same as the - block size used by the block manager and cache engine. - lora_request: LoRA request. - """ - - def __init__( - self, - seq_id: int, - prompt: str, - prompt_token_ids: List[int], - block_size: int, - lora_request: Optional[LoRARequest] = None, - ) -> None: - self.seq_id = seq_id - self.prompt = prompt - self.block_size = block_size - self.lora_request = lora_request - - self.data = SequenceData(prompt_token_ids) - self.output_logprobs: SampleLogprobs = [] - self.output_text = "" - - self.logical_token_blocks: List[LogicalTokenBlock] = [] - # Initialize the logical token blocks with the prompt token ids. 
- self._append_tokens_to_blocks(prompt_token_ids) - self.status = SequenceStatus.WAITING - - # Used for incremental detokenization - self.prefix_offset = 0 - self.read_offset = 0 - # Input + output tokens - self.tokens: Optional[List[str]] = None - - @property - def lora_int_id(self) -> int: - return self.lora_request.lora_int_id if self.lora_request else 0 - - def _append_logical_block(self) -> None: - block = LogicalTokenBlock( - block_number=len(self.logical_token_blocks), - block_size=self.block_size, - ) - self.logical_token_blocks.append(block) - - def _append_tokens_to_blocks(self, token_ids: List[int]) -> None: - cursor = 0 - while cursor < len(token_ids): - if not self.logical_token_blocks: - self._append_logical_block() - - last_block = self.logical_token_blocks[-1] - if last_block.is_full(): - self._append_logical_block() - last_block = self.logical_token_blocks[-1] - - num_empty_slots = last_block.get_num_empty_slots() - last_block.append_tokens(token_ids[cursor:cursor + - num_empty_slots]) - cursor += num_empty_slots - - def append_token_id( - self, - token_id: int, - logprobs: Dict[int, float], - ) -> None: - assert token_id in logprobs - self._append_tokens_to_blocks([token_id]) - self.output_logprobs.append(logprobs) - self.data.append_token_id(token_id, logprobs[token_id]) - - def get_len(self) -> int: - return self.data.get_len() - - def get_prompt_len(self) -> int: - return self.data.get_prompt_len() - - def get_output_len(self) -> int: - return self.data.get_output_len() - - def get_token_ids(self) -> List[int]: - return self.data.get_token_ids() - - def get_last_token_id(self) -> int: - return self.data.get_last_token_id() - - def get_output_token_ids(self) -> List[int]: - return self.data.output_token_ids - - def get_cumulative_logprob(self) -> float: - return self.data.cumulative_logprob - - def get_beam_search_score(self, - length_penalty: float = 0.0, - seq_len: Optional[int] = None, - eos_token_id: Optional[int] = None) -> float: - """Calculate the beam search score with length penalty. - - Adapted from - - https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938 - """ - if seq_len is None: - seq_len = self.get_len() - # NOTE: HF implementation does not count the EOS token - # towards the length, we align with that here for testing. - if (eos_token_id is not None - and self.get_last_token_id() == eos_token_id): - seq_len -= 1 - return self.get_cumulative_logprob() / (seq_len**length_penalty) - - def is_finished(self) -> bool: - return SequenceStatus.is_finished(self.status) - - def fork(self, new_seq_id: int) -> "Sequence": - new_seq = copy.deepcopy(self) - new_seq.seq_id = new_seq_id - return new_seq - - def __repr__(self) -> str: - return (f"Sequence(seq_id={self.seq_id}, " - f"status={self.status.name}, " - f"num_blocks={len(self.logical_token_blocks)})") - - -class SequenceGroup: - """A group of sequences that are generated from the same prompt. - - Args: - request_id: The ID of the request. - seqs: The list of sequences. - sampling_params: The sampling parameters used to generate the outputs. - arrival_time: The arrival time of the request. - lora_request: LoRA request. - prefix: The prefix of the prompt of the sequence group. 
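A quick numeric reading of `get_beam_search_score` above: the cumulative log probability is divided by `seq_len ** length_penalty`, and because log probabilities are negative, a larger `length_penalty` pulls long sequences toward zero and therefore promotes them. The numbers below are arbitrary:

```python
def beam_search_score(cumulative_logprob: float,
                      seq_len: int,
                      length_penalty: float = 1.0) -> float:
    # Same formula as Sequence.get_beam_search_score (EOS handling omitted).
    return cumulative_logprob / (seq_len**length_penalty)


# Two hypothetical candidates: a short one and a longer, less likely one.
short = beam_search_score(-4.0, seq_len=8)    # -0.5    -> wins at penalty 1.0
long = beam_search_score(-9.0, seq_len=16)    # -0.5625

assert short > long
# Raising the penalty divides by a larger number, so the longer candidate is
# penalized less and overtakes the shorter one.
assert beam_search_score(-9.0, 16, 2.0) > beam_search_score(-4.0, 8, 2.0)
```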
- """ - - def __init__( - self, - request_id: str, - seqs: List[Sequence], - sampling_params: SamplingParams, - arrival_time: float, - lora_request: Optional[LoRARequest] = None, - prefix: Optional[Prefix] = None, - ) -> None: - self.request_id = request_id - self.seqs_dict = {seq.seq_id: seq for seq in seqs} - self.sampling_params = sampling_params - self.arrival_time = arrival_time - self.lora_request = lora_request - self.prefix: Optional[Prefix] = prefix - self.prompt_logprobs: Optional[PromptLogprobs] = None - - @property - def prompt(self) -> str: - # All sequences in the group should have the same prompt. - # We use the prompt of an arbitrary sequence. - return next(iter(self.seqs_dict.values())).prompt - - @property - def prompt_token_ids(self) -> List[int]: - # All sequences in the group should have the same prompt. - # We use the prompt of an arbitrary sequence. - return next(iter(self.seqs_dict.values())).data.prompt_token_ids - - @property - def lora_int_id(self) -> int: - return self.lora_request.lora_int_id if self.lora_request else 0 - - def get_max_num_running_seqs(self) -> int: - """The maximum number of sequences running in parallel in the remaining - lifetime of the request.""" - if self.sampling_params.use_beam_search: - # For beam search, maximally there will always be `best_of` beam - # candidates running in the future. - return self.sampling_params.best_of - else: - if self.sampling_params.best_of > self.num_seqs(): - # At prompt stage, the sequence group is not yet filled up - # and only have one sequence running. However, in the - # generation stage, we will have `best_of` sequences running. - return self.sampling_params.best_of - # At sampling stages, return the number of actual sequences - # that are not finished yet. - return self.num_unfinished_seqs() - - def get_seqs( - self, - status: Optional[SequenceStatus] = None, - ) -> List[Sequence]: - if status is None: - return list(self.seqs_dict.values()) - else: - return [ - seq for seq in self.seqs_dict.values() if seq.status == status - ] - - def get_unfinished_seqs(self) -> List[Sequence]: - return [ - seq for seq in self.seqs_dict.values() if not seq.is_finished() - ] - - def get_finished_seqs(self) -> List[Sequence]: - return [seq for seq in self.seqs_dict.values() if seq.is_finished()] - - def num_seqs(self, status: Optional[SequenceStatus] = None) -> int: - return len(self.get_seqs(status)) - - def num_unfinished_seqs(self) -> int: - return len(self.get_unfinished_seqs()) - - def num_finished_seqs(self) -> int: - return len(self.get_finished_seqs()) - - def find(self, seq_id: int) -> Sequence: - if seq_id not in self.seqs_dict: - raise ValueError(f"Sequence {seq_id} not found.") - return self.seqs_dict[seq_id] - - def add(self, seq: Sequence) -> None: - if seq.seq_id in self.seqs_dict: - raise ValueError(f"Sequence {seq.seq_id} already exists.") - self.seqs_dict[seq.seq_id] = seq - - def remove(self, seq_id: int) -> None: - if seq_id not in self.seqs_dict: - raise ValueError(f"Sequence {seq_id} not found.") - del self.seqs_dict[seq_id] - - def is_finished(self) -> bool: - return all(seq.is_finished() for seq in self.get_seqs()) - - def __repr__(self) -> str: - return (f"SequenceGroup(request_id={self.request_id}, " - f"sampling_params={self.sampling_params}, " - f"num_seqs={len(self.seqs_dict)})") - - -class SequenceGroupMetadata: - """Metadata for a sequence group. Used to create `InputMetadata`. - - Args: - request_id: The ID of the request. - is_prompt: Whether the request is at prompt stage. 
- seq_data: The sequence data. (Seq id -> sequence data) - sampling_params: The sampling parameters used to generate the outputs. - block_tables: The block tables. (Seq id -> list of physical block - numbers) - lora_request: LoRA request. - prefix: The prefix of the prompt of the sequence group. - """ - - def __init__( - self, - request_id: str, - is_prompt: bool, - seq_data: Dict[int, SequenceData], - sampling_params: SamplingParams, - block_tables: Dict[int, List[int]], - lora_request: Optional[LoRARequest] = None, - prefix: Optional[Prefix] = None, - ) -> None: - self.request_id = request_id - self.is_prompt = is_prompt - self.seq_data = seq_data - self.sampling_params = sampling_params - self.block_tables = block_tables - self.lora_request = lora_request - self.prefix = prefix - - @property - def lora_int_id(self) -> int: - return self.lora_request.lora_int_id if self.lora_request else 0 - - -class SequenceOutput: - """The model output associated with a sequence. - - Args: - parent_seq_id: The ID of the parent sequence (for forking in beam - search). - output_token: The output token ID. - logprobs: The logprobs of the output token. - (Token id -> logP(x_i+1 | x_0, ..., x_i)) - """ - - def __init__( - self, - parent_seq_id: int, - output_token: int, - logprobs: Dict[int, float], - ) -> None: - self.parent_seq_id = parent_seq_id - self.output_token = output_token - self.logprobs = logprobs - - def __repr__(self) -> str: - return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, " - f"output_token={self.output_token}, " - f"logprobs={self.logprobs})") - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SequenceOutput): - raise NotImplementedError() - return (self.parent_seq_id == other.parent_seq_id - and self.output_token == other.output_token - and self.logprobs == other.logprobs) - - -class SequenceGroupOutput: - """The model output associated with a sequence group.""" - - def __init__( - self, - samples: List[SequenceOutput], - prompt_logprobs: Optional[PromptLogprobs], - ) -> None: - self.samples = samples - self.prompt_logprobs = prompt_logprobs - - def __repr__(self) -> str: - return (f"SequenceGroupOutput(samples={self.samples}, " - f"prompt_logprobs={self.prompt_logprobs})") - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SequenceGroupOutput): - raise NotImplementedError() - return (self.samples == other.samples - and self.prompt_logprobs == other.prompt_logprobs) - - -# For each sequence group, we generate a list of SequenceOutput object, -# each of which contains one possible candidate for the next token. 
-SamplerOutput = List[SequenceGroupOutput] diff --git a/vllm/test_utils.py b/vllm/test_utils.py deleted file mode 100644 index 4f74c05038e70926e2e3a6329baf0cd387fec909..0000000000000000000000000000000000000000 --- a/vllm/test_utils.py +++ /dev/null @@ -1,38 +0,0 @@ -import ray - -from vllm.config import ParallelConfig -from vllm.utils import get_open_port -from vllm.worker.worker import init_distributed_environment - - -def init_test_distributed_environment( - pipeline_parallel_size: int, - tensor_parallel_size: int, - rank: int, - distributed_init_port: str, -) -> None: - parallel_config = ParallelConfig(pipeline_parallel_size, - tensor_parallel_size, - worker_use_ray=True) - distributed_init_method = f"tcp://localhost:{distributed_init_port}" - init_distributed_environment(parallel_config, rank, - distributed_init_method) - - -def multi_process_tensor_parallel( - tensor_parallel_size: int, - test_target, -) -> None: - # Using ray helps debugging the error when it failed - # as compared to multiprocessing. - ray.init() - - distributed_init_port = get_open_port() - refs = [] - for rank in range(tensor_parallel_size): - refs.append( - test_target.remote(tensor_parallel_size, rank, - distributed_init_port)) - ray.get(refs) - - ray.shutdown() diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py deleted file mode 100644 index 8b16e559b24f29b2dc18dc5f0e81b04db4378c43..0000000000000000000000000000000000000000 --- a/vllm/transformers_utils/config.py +++ /dev/null @@ -1,39 +0,0 @@ -from typing import Optional - -from transformers import AutoConfig, PretrainedConfig - -from vllm.transformers_utils.configs import * - -_CONFIG_REGISTRY = { - "aquila": AquilaConfig, - "baichuan": BaiChuanConfig, - "chatglm": ChatGLMConfig, - "mpt": MPTConfig, - "qwen": QWenConfig, - "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) - "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) - "yi": YiConfig, -} - - -def get_config(model: str, - trust_remote_code: bool, - revision: Optional[str] = None) -> PretrainedConfig: - try: - config = AutoConfig.from_pretrained( - model, trust_remote_code=trust_remote_code, revision=revision) - except ValueError as e: - if (not trust_remote_code and - "requires you to execute the configuration file" in str(e)): - err_msg = ( - "Failed to load the model config. 
If the model is a custom " - "model not yet available in the HuggingFace transformers " - "library, consider setting `trust_remote_code=True` in LLM " - "or using the `--trust-remote-code` flag in the CLI.") - raise RuntimeError(err_msg) from e - else: - raise e - if config.model_type in _CONFIG_REGISTRY: - config_class = _CONFIG_REGISTRY[config.model_type] - config = config_class.from_pretrained(model, revision=revision) - return config diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py deleted file mode 100644 index 284867414e0ed512ee691c4b39c31e0980114e71..0000000000000000000000000000000000000000 --- a/vllm/transformers_utils/configs/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from vllm.transformers_utils.configs.aquila import AquilaConfig -from vllm.transformers_utils.configs.baichuan import BaiChuanConfig -from vllm.transformers_utils.configs.chatglm import ChatGLMConfig -from vllm.transformers_utils.configs.mpt import MPTConfig -from vllm.transformers_utils.configs.qwen import QWenConfig -# RWConfig is for the original tiiuae/falcon-40b(-instruct) and -# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the -# `FalconConfig` class from the official HuggingFace transformers library. -from vllm.transformers_utils.configs.falcon import RWConfig -from vllm.transformers_utils.configs.yi import YiConfig - -__all__ = [ - "AquilaConfig", - "BaiChuanConfig", - "ChatGLMConfig", - "MPTConfig", - "QWenConfig", - "RWConfig", - "YiConfig", -] diff --git a/vllm/transformers_utils/configs/aquila.py b/vllm/transformers_utils/configs/aquila.py deleted file mode 100644 index 86a6f2ba304af17130b7874d1c2c8d673ab4cccd..0000000000000000000000000000000000000000 --- a/vllm/transformers_utils/configs/aquila.py +++ /dev/null @@ -1,69 +0,0 @@ -# coding=utf-8 -# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
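The `_CONFIG_REGISTRY` dispatch in `get_config` above keys on `config.model_type`. A toy sketch of the same idea; the registry and config class here are placeholders, not the real vLLM ones:

```python
from transformers import PretrainedConfig


class ToyFalconConfig(PretrainedConfig):  # placeholder stand-in
    model_type = "RefinedWeb"


TOY_REGISTRY = {"RefinedWeb": ToyFalconConfig}


def resolve_config_class(config: PretrainedConfig):
    # Mirror of the dispatch in get_config: use the registered override for
    # this model_type if there is one, otherwise keep the HF config class.
    return TOY_REGISTRY.get(config.model_type, type(config))


assert resolve_config_class(ToyFalconConfig()) is ToyFalconConfig
```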
-""" Aquila model configuration""" - -from transformers import PretrainedConfig - - -class AquilaConfig(PretrainedConfig): - model_type = "aquila" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=100008, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.006, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/vllm/transformers_utils/configs/baichuan.py b/vllm/transformers_utils/configs/baichuan.py deleted file mode 100644 index 869817525c11af4e15403cdce0e40206b90a5bd6..0000000000000000000000000000000000000000 --- a/vllm/transformers_utils/configs/baichuan.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from transformers.configuration_utils import PretrainedConfig - - -class BaiChuanConfig(PretrainedConfig): - model_type = "baichuan" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=64000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py deleted file mode 100644 index c4244f8c77f44d912b154d01646f321b607f1988..0000000000000000000000000000000000000000 --- a/vllm/transformers_utils/configs/chatglm.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/THUDM/ChatGLM2-6B -from transformers import PretrainedConfig - - -class ChatGLMConfig(PretrainedConfig): - model_type = "chatglm" - attribute_map = { - "num_hidden_layers": "num_layers", - "n_head_kv": "multi_query_group_num", - } - - def __init__(self, - num_layers=28, - padded_vocab_size=65024, - hidden_size=4096, - ffn_hidden_size=13696, - kv_channels=128, - num_attention_heads=32, - seq_length=2048, - hidden_dropout=0.0, - attention_dropout=0.0, - layernorm_epsilon=1e-5, - rmsnorm=True, - apply_residual_connection_post_layernorm=False, - post_layer_norm=True, - add_bias_linear=False, - add_qkv_bias=False, - interleaved_qkv=False, - bias_dropout_fusion=True, - multi_query_attention=False, - multi_query_group_num=1, - apply_query_key_layer_scaling=True, - attention_softmax_in_fp32=True, - fp32_residual_connection=False, - quantization_bit=0, - pre_seq_len=None, - prefix_projection=False, - **kwargs): - self.num_layers = num_layers - self.vocab_size = padded_vocab_size - self.padded_vocab_size = padded_vocab_size - self.hidden_size = hidden_size - self.ffn_hidden_size = ffn_hidden_size - self.kv_channels = kv_channels - self.num_attention_heads = num_attention_heads - self.seq_length = seq_length - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - self.layernorm_epsilon = layernorm_epsilon - self.rmsnorm = rmsnorm - self.apply_residual_connection_post_layernorm = ( - apply_residual_connection_post_layernorm) - self.post_layer_norm = post_layer_norm - self.add_bias_linear = add_bias_linear - self.add_qkv_bias = add_qkv_bias - self.bias_dropout_fusion = bias_dropout_fusion - self.multi_query_attention = multi_query_attention - self.multi_query_group_num = multi_query_group_num - self.apply_query_key_layer_scaling = apply_query_key_layer_scaling - self.attention_softmax_in_fp32 = attention_softmax_in_fp32 - self.fp32_residual_connection = fp32_residual_connection - self.quantization_bit = quantization_bit - self.pre_seq_len = pre_seq_len - self.prefix_projection = prefix_projection - self.interleaved_qkv = interleaved_qkv 
- super().__init__(**kwargs) diff --git a/vllm/transformers_utils/configs/falcon.py b/vllm/transformers_utils/configs/falcon.py deleted file mode 100644 index c82cc6065c7eab9d8f7747fc1764f612a78ed6c2..0000000000000000000000000000000000000000 --- a/vllm/transformers_utils/configs/falcon.py +++ /dev/null @@ -1,87 +0,0 @@ -# Adapted from -# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py -# Copyright 2023 The vLLM team. -# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Falcon configuration""" -from transformers.configuration_utils import PretrainedConfig - - -class RWConfig(PretrainedConfig): - model_type = "falcon" - keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = { - "num_hidden_layers": "n_layer", - "num_attention_heads": "n_head", - "num_kv_heads": "n_head_kv", - } - - def __init__( - self, - vocab_size=250880, - hidden_size=64, - n_layer=2, - n_head=8, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - use_cache=True, - bos_token_id=1, - eos_token_id=2, - hidden_dropout=0.0, - attention_dropout=0.0, - multi_query=True, - n_head_kv=None, - alibi=False, - bias=False, - parallel_attn=False, - new_decoder_architecture=False, - **kwargs, - ) -> None: - self.vocab_size = vocab_size - # Backward compatibility with n_embed kwarg - n_embed = kwargs.pop("n_embed", None) - self.hidden_size = hidden_size if n_embed is None else n_embed - self.n_layer = n_layer - self.n_head = n_head - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.use_cache = use_cache - self.hidden_dropout = hidden_dropout - self.attention_dropout = attention_dropout - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - self.multi_query = multi_query - self.n_head_kv = 1 if n_head_kv is None else n_head_kv - self.alibi = alibi - self.bias = bias - self.parallel_attn = parallel_attn - self.new_decoder_architecture = new_decoder_architecture - - if self.hidden_size == 8192: - # Hack for falcon-40b - self.new_decoder_architecture = True - - super().__init__(bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs) - - @property - def head_dim(self): - return self.hidden_size // self.n_head - - @property - def rotary(self): - return not self.alibi diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py deleted file mode 100644 index 5ea0d9122ef11ee6bb965fa9c4f0d7c3ac9d718c..0000000000000000000000000000000000000000 --- a/vllm/transformers_utils/configs/mpt.py +++ /dev/null @@ -1,232 +0,0 @@ -# coding=utf-8 -# Copied from -# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py -"""A HuggingFace-style model configuration.""" -import warnings -from typing import Any, Dict, Optional, Union -from transformers import PretrainedConfig - -attn_config_defaults: Dict = { - 'attn_type': 'multihead_attention', - 'attn_pdrop': 0.0, - 'attn_impl': 'triton', - 'qk_ln': False, - 
'clip_qkv': None, - 'softmax_scale': None, - 'prefix_lm': False, - 'attn_uses_sequence_id': False, - 'alibi': False, - 'alibi_bias_max': 8 -} -ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'} -init_config_defaults: Dict = { - 'name': 'kaiming_normal_', - 'fan_mode': 'fan_in', - 'init_nonlinearity': 'relu', - 'init_div_is_residual': True, - 'emb_init_std': None, - 'emb_init_uniform_lim': None, - 'init_std': None, - 'init_gain': 0.0 -} - - -class MPTConfig(PretrainedConfig): - model_type = 'mpt' - attribute_map = { - 'num_attention_heads': 'n_heads', - 'hidden_size': 'd_model', - 'num_hidden_layers': 'n_layers', - } - - # pylint: disable=dangerous-default-value - def __init__(self, - d_model: int = 2048, - n_heads: int = 16, - n_layers: int = 24, - expansion_ratio: int = 4, - max_seq_len: int = 2048, - vocab_size: int = 50368, - resid_pdrop: float = 0.0, - emb_pdrop: float = 0.0, - learned_pos_emb: bool = True, - attn_config: Dict = attn_config_defaults, - ffn_config: Dict = ffn_config_defaults, - init_device: str = 'cpu', - logit_scale: Optional[Union[float, str]] = None, - no_bias: bool = False, - embedding_fraction: float = 1.0, - norm_type: str = 'low_precision_layernorm', - use_cache: bool = False, - init_config: Dict = init_config_defaults, - fc_type: str = 'torch', - verbose: Optional[int] = None, - **kwargs: Any): - """The MPT configuration class. - Args: - d_model (int): The size of the embedding dimension of the model. - n_heads (int): The number of attention heads. - n_layers (int): The number of layers in the model. - expansion_ratio (int): The ratio of the up/down scale in the ffn. - max_seq_len (int): The maximum sequence length of the model. - vocab_size (int): The size of the vocabulary. - resid_pdrop (float): The dropout probability applied to the attention output before combining with residual. - emb_pdrop (float): The dropout probability for the embedding layer. - learned_pos_emb (bool): Whether to use learned positional embeddings - attn_config (Dict): A dictionary used to configure the model's attention module: - attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention - attn_pdrop (float): The dropout probability for the attention layers. - attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. - qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. - clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to - this value. - softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, - use the default scale of ``1/sqrt(d_keys)``. - prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an - extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix - can attend to one another bi-directionally. Tokens outside the prefix use causal attention. - attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id. - When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates - which sub-sequence each token belongs to. - Defaults to ``False`` meaning any provided `sequence_id` will be ignored. - alibi (bool): Whether to use the alibi bias instead of position embeddings. - alibi_bias_max (int): The maximum value of the alibi bias. 
- kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. - ffn_config (Dict): A dictionary used to configure the model's ffn module: - ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp - init_device (str): The device to use for parameter initialization. - logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. - no_bias (bool): Whether to use bias in all layers. - verbose (int): The verbosity level. 0 is silent. - embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. - norm_type (str): choose type of norm to use - use_cache (bool): Whether or not the model should return the last key/values attentions - init_config (Dict): A dictionary used to configure the model initialization: - init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', - 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or - 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch. - init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True. - emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer. - emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution - used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``. - init_std (float): The standard deviation of the normal distribution used to initialize the model, - if using the baseline_ parameter initialization scheme. - init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes. - fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes. - init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes. - --- - See llmfoundry.models.utils.param_init_fns.py for info on other param init config options - fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs. - """ - self.d_model = d_model - self.n_heads = n_heads - self.n_layers = n_layers - self.expansion_ratio = expansion_ratio - self.max_seq_len = max_seq_len - self.vocab_size = vocab_size - self.resid_pdrop = resid_pdrop - self.emb_pdrop = emb_pdrop - self.learned_pos_emb = learned_pos_emb - self.attn_config = attn_config - self.ffn_config = ffn_config - self.init_device = init_device - self.logit_scale = logit_scale - self.no_bias = no_bias - self.embedding_fraction = embedding_fraction - self.norm_type = norm_type - self.use_cache = use_cache - self.init_config = init_config - self.fc_type = fc_type - if verbose is not None: - warnings.warn(DeprecationWarning( - 'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.' 
- ), - stacklevel=2) - if 'name' in kwargs: - del kwargs['name'] - if 'loss_fn' in kwargs: - del kwargs['loss_fn'] - if self.attn_config.get('alibi', False): - self.learned_pos_emb = False - warnings.warn( - f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`', - stacklevel=2) - super().__init__(**kwargs) - self._validate_config() - - def _set_config_defaults( - self, config: Dict[str, Any], - config_defaults: Dict[str, Any]) -> Dict[str, Any]: - for (k, v) in config_defaults.items(): - if k not in config: - config[k] = v - return config - - def _validate_config(self) -> None: - self.attn_config = self._set_config_defaults(self.attn_config, - attn_config_defaults) - self.ffn_config = self._set_config_defaults(self.ffn_config, - ffn_config_defaults) - self.init_config = self._set_config_defaults(self.init_config, - init_config_defaults) - if self.d_model % self.n_heads != 0: - raise ValueError('d_model must be divisible by n_heads') - if any(( - prob < 0 or prob > 1 for prob in - [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] - )): - raise ValueError( - "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1" # pylint: disable=line-too-long - ) - if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: - raise ValueError( - f"Unknown attn_impl={self.attn_config['attn_impl']}") - if self.attn_config['prefix_lm'] and self.attn_config[ - 'attn_impl'] not in ['torch', 'triton']: - raise NotImplementedError( - 'prefix_lm only implemented with torch and triton attention.') - if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [ - 'torch', 'triton' - ]: - raise NotImplementedError( - 'alibi only implemented with torch and triton attention.') - if self.attn_config['attn_uses_sequence_id'] and self.attn_config[ - 'attn_impl'] not in ['torch', 'triton']: - raise NotImplementedError( - 'attn_uses_sequence_id only implemented with torch and triton attention.' # pylint: disable=line-too-long - ) - if self.embedding_fraction > 1 or self.embedding_fraction <= 0: - raise ValueError( - 'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!' # pylint: disable=line-too-long - ) - if isinstance(self.logit_scale, - str) and self.logit_scale != 'inv_sqrt_d_model': - raise ValueError( - f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." # pylint: disable=line-too-long - ) - if self.init_config.get('name', None) is None: - raise ValueError( - f"self.init_config={self.init_config!r} 'name' needs to be set." - ) - if not self.learned_pos_emb and (not self.attn_config['alibi']): - warnings.warn( - 'Positional information not being provided to the model.', - stacklevel=2) - if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp': - try: - # pylint: disable=import-outside-toplevel - import transformer_engine.pytorch as te - del te - except Exception as exc: - raise ImportError( - # pylint: disable=line-too-long - 'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. 
' - + - 'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n' - + 'pip install flash-attn==1.0.6 --no-build-isolation \n' + - 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156' - ) from exc - if self.ffn_config['ffn_type'] == 'mptmlp': - self.ffn_config['fc_type'] = self.fc_type - elif self.ffn_config['ffn_type'] == 'te_ln_mlp': - self.ffn_config['bias'] = not self.no_bias diff --git a/vllm/transformers_utils/configs/qwen.py b/vllm/transformers_utils/configs/qwen.py deleted file mode 100644 index bb033a337ad0468898138340e7ac7f58419ac286..0000000000000000000000000000000000000000 --- a/vllm/transformers_utils/configs/qwen.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) Alibaba Cloud. -# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE - -from transformers import PretrainedConfig - - -class QWenConfig(PretrainedConfig): - model_type = "qwen" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=151936, - hidden_size=4096, - num_hidden_layers=32, - num_attention_heads=32, - emb_dropout_prob=0.0, - attn_dropout_prob=0.0, - layer_norm_epsilon=1e-6, - initializer_range=0.02, - max_position_embeddings=8192, - scale_attn_weights=True, - use_cache=True, - bf16=False, - fp16=False, - fp32=False, - kv_channels=128, - rotary_pct=1.0, - rotary_emb_base=10000, - use_dynamic_ntk=True, - use_logn_attn=True, - use_flash_attn="auto", - intermediate_size=22016, - no_bias=True, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.emb_dropout_prob = emb_dropout_prob - self.attn_dropout_prob = attn_dropout_prob - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.scale_attn_weights = scale_attn_weights - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.bf16 = bf16 - self.fp16 = fp16 - self.fp32 = fp32 - self.kv_channels = kv_channels - self.rotary_pct = rotary_pct - self.rotary_emb_base = rotary_emb_base - self.use_dynamic_ntk = use_dynamic_ntk - self.use_logn_attn = use_logn_attn - self.use_flash_attn = use_flash_attn - self.no_bias = no_bias - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/vllm/transformers_utils/configs/yi.py b/vllm/transformers_utils/configs/yi.py deleted file mode 100644 index 359922ed269522acc8f55f78e9b6e4d3291186f9..0000000000000000000000000000000000000000 --- a/vllm/transformers_utils/configs/yi.py +++ /dev/null @@ -1,64 +0,0 @@ -""" Yi model configuration""" -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -Yi_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - - -class YiConfig(PretrainedConfig): - r""" - Reference: - https://huggingface.co/01-ai/Yi-6B/blob/main/configuration_yi.py - """ - model_type = "Yi" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=64000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - 
output_attentions=False, - rope_theta=5000000.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.output_attentions = output_attentions - self.rope_theta = rope_theta - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py deleted file mode 100644 index 6edc225cdfc802c6fdda3d23f3ef0530d77fec74..0000000000000000000000000000000000000000 --- a/vllm/transformers_utils/tokenizer.py +++ /dev/null @@ -1,245 +0,0 @@ -from typing import List, Optional, Tuple, Union - -from transformers import (AutoTokenizer, PreTrainedTokenizer, - PreTrainedTokenizerFast) - -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.utils import make_async, LRUCache -from vllm.transformers_utils.tokenizers import * - -logger = init_logger(__name__) - - -def get_tokenizer( - tokenizer_name: str, - *args, - tokenizer_mode: str = "auto", - trust_remote_code: bool = False, - tokenizer_revision: Optional[str] = None, - **kwargs, -) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: - """Gets a tokenizer for the given model name via Huggingface.""" - if tokenizer_mode == "slow": - if kwargs.get("use_fast", False): - raise ValueError( - "Cannot use the fast tokenizer in slow tokenizer mode.") - kwargs["use_fast"] = False - - try: - tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name, - *args, - trust_remote_code=trust_remote_code, - tokenizer_revision=tokenizer_revision, - **kwargs) - except ValueError as e: - # If the error pertains to the tokenizer class not existing or not - # currently being imported, suggest using the --trust-remote-code flag. - if (not trust_remote_code and - ("does not exist or is not currently imported." in str(e) - or "requires you to execute the tokenizer file" in str(e))): - err_msg = ( - "Failed to load the tokenizer. If the tokenizer is a custom " - "tokenizer not yet available in the HuggingFace transformers " - "library, consider setting `trust_remote_code=True` in LLM " - "or using the `--trust-remote-code` flag in the CLI.") - raise RuntimeError(err_msg) from e - else: - raise e - except AttributeError as e: - if "BaichuanTokenizer" in str(e): - # This is for the error "'BaichuanTokenizer' object has no - # attribute 'sp_model'". - tokenizer = BaichuanTokenizer.from_pretrained( - tokenizer_name, - *args, - trust_remote_code=trust_remote_code, - tokenizer_revision=tokenizer_revision, - **kwargs) - else: - raise e - - if not isinstance(tokenizer, PreTrainedTokenizerFast): - logger.warning( - "Using a slow tokenizer. This might cause a significant " - "slowdown. 
Consider using a fast tokenizer instead.") - return tokenizer - - -def get_lora_tokenizer(lora_request: LoRARequest, *args, - **kwargs) -> Optional[PreTrainedTokenizer]: - if lora_request is None: - return None - try: - tokenizer = get_tokenizer(lora_request.lora_local_path, *args, - **kwargs) - except OSError as e: - # No tokenizer was found in the LoRA folder, - # use base model tokenizer - logger.warning( - f"No tokenizer found in {lora_request.lora_local_path}, " - "using base model tokenizer instead. " - f"(Exception: {str(e)})") - tokenizer = None - return tokenizer - - -get_lora_tokenizer_async = make_async(get_lora_tokenizer) - - -class TokenizerGroup: - """A group of tokenizers that can be used for LoRA adapters.""" - - def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, - max_input_length: Optional[int], **tokenizer_config): - self.tokenizer_id = tokenizer_id - self.tokenizer_config = tokenizer_config - self.enable_lora = enable_lora - self.max_input_length = max_input_length - self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config) - if enable_lora: - self.lora_tokenizers = LRUCache(capacity=max_num_seqs) - else: - self.lora_tokenizers = None - - def encode(self, - prompt: str, - request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: - tokenizer = self.get_lora_tokenizer(lora_request) - return tokenizer.encode(prompt) - - async def encode_async( - self, - prompt: str, - request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: - tokenizer = await self.get_lora_tokenizer_async(lora_request) - return tokenizer.encode(prompt) - - def get_lora_tokenizer( - self, - lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": - if not lora_request or not self.enable_lora: - return self.tokenizer - if lora_request.lora_int_id not in self.lora_tokenizers: - tokenizer = (get_lora_tokenizer( - lora_request, **self.tokenizer_config) or self.tokenizer) - self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer) - return tokenizer - else: - return self.lora_tokenizers.get(lora_request.lora_int_id) - - async def get_lora_tokenizer_async( - self, - lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": - if not lora_request or not self.enable_lora: - return self.tokenizer - if lora_request.lora_int_id not in self.lora_tokenizers: - tokenizer = (await get_lora_tokenizer_async( - lora_request, **self.tokenizer_config) or self.tokenizer) - self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer) - return tokenizer - else: - return self.lora_tokenizers.get(lora_request.lora_int_id) - - -def _convert_tokens_to_string_with_added_encoders( - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - output_tokens: List[str], - skip_special_tokens: bool, - spaces_between_special_tokens: bool, -) -> str: - # Adapted from - # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921 - # NOTE(woosuk): The following code is slow because it runs a for loop over - # the output_tokens. In Python, running a for loop over a list can be slow - # even when the loop body is very simple. 
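Before the incremental detokenization helpers below, here is a rough usage sketch of the `TokenizerGroup` defined above; the tokenizer id is an illustrative assumption and LoRA is disabled, so only the base tokenizer is exercised.

```python
# Hedged usage sketch of TokenizerGroup (defined above); the tokenizer id is
# an assumed example and requires Hub access.
from vllm.transformers_utils.tokenizer import TokenizerGroup

group = TokenizerGroup("facebook/opt-125m",
                       enable_lora=False,
                       max_num_seqs=256,
                       max_input_length=None)

# With LoRA disabled (or no lora_request passed), encode() uses the base
# tokenizer loaded in the constructor.
token_ids = group.encode("Hello, vLLM!")
print(token_ids)
```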
- sub_texts = [] - current_sub_text = [] - all_special_tokens = set(tokenizer.all_special_tokens) - for token in output_tokens: - if skip_special_tokens and token in all_special_tokens: - continue - if token in tokenizer.get_added_vocab(): - if current_sub_text: - sub_text = tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) - current_sub_text = [] - sub_texts.append(token) - else: - current_sub_text.append(token) - if current_sub_text: - sub_text = tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) - if spaces_between_special_tokens: - return " ".join(sub_texts) - else: - return "".join(sub_texts) - - -# Based on -# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15 -# under Apache 2.0 license -def detokenize_incrementally( - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - all_input_ids: List[int], - prev_tokens: Optional[List[str]], - prefix_offset: int = 0, - read_offset: int = 0, - skip_special_tokens: bool = False, - spaces_between_special_tokens: bool = True, -) -> Tuple[List[str], str, int, int]: - new_token_id = all_input_ids[-1] - # This is the first iteration for this sequence - if prev_tokens is None: - new_tokens = tokenizer.convert_ids_to_tokens( - all_input_ids, skip_special_tokens=skip_special_tokens) - output_tokens = new_tokens - # 5 is an arbitrary value that should work for all - # tokenizers (bigger = more conservative). - # Subtract 1 extra to account for the generated token. - prefix_offset = max(len(output_tokens) - 6, 0) - # If the first new token is a special token, we can't skip 1 extra token - if skip_special_tokens and new_token_id in tokenizer.all_special_ids: - read_offset = max(len(output_tokens), 0) - else: - read_offset = max(len(output_tokens) - 1, 0) - else: - # Put new_token_id in a list so skip_special_tokens is respected - new_tokens = tokenizer.convert_ids_to_tokens( - [new_token_id], skip_special_tokens=skip_special_tokens) - output_tokens = prev_tokens + new_tokens - - # The prefix text is necessary only to defeat cleanup algorithms in - # the decode which decide to add a space or not depending on the - # surrounding ids. - if tokenizer.is_fast or not tokenizer.get_added_vocab(): - prefix_text = tokenizer.convert_tokens_to_string( - output_tokens[prefix_offset:read_offset]) - new_text = tokenizer.convert_tokens_to_string( - output_tokens[prefix_offset:]) - else: - prefix_text = _convert_tokens_to_string_with_added_encoders( - tokenizer, - output_tokens[prefix_offset:read_offset], - skip_special_tokens=skip_special_tokens, - spaces_between_special_tokens=spaces_between_special_tokens, - ) - new_text = _convert_tokens_to_string_with_added_encoders( - tokenizer, - output_tokens[prefix_offset:], - skip_special_tokens=skip_special_tokens, - spaces_between_special_tokens=spaces_between_special_tokens, - ) - - if len(new_text) > len(prefix_text) and not new_text.endswith("�"): - # utf-8 char at the end means it's a potential unfinished byte sequence - # from byte fallback tokenization. 
- # If it's in the middle, it's probably a real invalid id generated - # by the model - new_text = new_text[len(prefix_text):] - return new_tokens, new_text, read_offset, len(output_tokens) - else: - return new_tokens, "", prefix_offset, read_offset diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py deleted file mode 100644 index e6b59722c2591eca0e91d6fad22e805a022926d2..0000000000000000000000000000000000000000 --- a/vllm/transformers_utils/tokenizers/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer - -__all__ = [ - "BaichuanTokenizer", -] diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py deleted file mode 100644 index 1dd241e4a5c4be968bf16a03995f2838083b69a9..0000000000000000000000000000000000000000 --- a/vllm/transformers_utils/tokenizers/baichuan.py +++ /dev/null @@ -1,263 +0,0 @@ -# yapf: disable -# Adapted from -# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py -# This includes a fix suggested in -# https://github.com/vllm-project/vllm/issues/1403#issuecomment-1767503058 -# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved. - -import os -from shutil import copyfile -from typing import Any, Dict, List, Optional, Tuple - -import sentencepiece as spm -from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": {}, - "tokenizer_file": {}, -} -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} - - -class BaichuanTokenizer(PreTrainedTokenizer): - """ - Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding. - - Args: - vocab_file (`str`): - Path to the vocabulary file. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids", "attention_mask"] - - def __init__( - self, - vocab_file, - unk_token="<unk>", - bos_token="<s>", - eos_token="</s>", - pad_token=None, - sp_model_kwargs: Optional[Dict[str, Any]] = None, - add_bos_token=True, - add_eos_token=False, - clean_up_tokenization_spaces=False, - **kwargs, - ): - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = ( - AddedToken(bos_token, lstrip=False, rstrip=False) - if isinstance(bos_token, str) - else bos_token - ) - eos_token = ( - AddedToken(eos_token, lstrip=False, rstrip=False) - if isinstance(eos_token, str) - else eos_token - ) - unk_token = ( - AddedToken(unk_token, lstrip=False, rstrip=False) - if isinstance(unk_token, str) - else unk_token - ) - pad_token = ( - AddedToken(pad_token, lstrip=False, rstrip=False) - if isinstance(pad_token, str) - else pad_token - ) - self.vocab_file = vocab_file - self.add_bos_token = add_bos_token - self.add_eos_token = add_eos_token - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - pad_token=pad_token, - add_bos_token=add_bos_token, - add_eos_token=add_eos_token, - sp_model_kwargs=self.sp_model_kwargs, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - **kwargs, - ) - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.vocab_file) - - @property - def vocab_size(self): - """Returns vocab size""" - return self.sp_model.get_piece_size() - - def get_vocab(self): - """Returns vocab as a dict""" - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text): - """Returns a tokenized string.""" - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.sp_model.piece_to_id(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - token = self.sp_model.IdToPiece(index) - return token - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - current_sub_tokens = [] - out_string = "" - prev_is_special = False - for i, token in enumerate(tokens): - # make sure that special tokens are not decoded using sentencepiece model - if token in self.all_special_tokens: - if not prev_is_special and i != 0: - out_string += " " - out_string += self.sp_model.decode(current_sub_tokens) + token - prev_is_special = True - current_sub_tokens = [] - else: - current_sub_tokens.append(token) - prev_is_special = False - out_string += self.sp_model.decode(current_sub_tokens) - return out_string - - def save_vocabulary( - self, save_directory, filename_prefix: Optional[str] = None - ) -> Tuple[str]: - """ - Save the vocabulary and special tokens file to a directory. - - Args: - save_directory (`str`): - The directory in which to save the vocabulary. - - Returns: - `Tuple(str)`: Paths to the files saved. 
- """ - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, - (filename_prefix + "-" if filename_prefix else "") - + VOCAB_FILES_NAMES["vocab_file"], - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath( - out_vocab_file - ) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file,) - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - bos_token_id = [self.bos_token_id] if self.add_bos_token else [] - eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - - output = bos_token_id + token_ids_0 + eos_token_id - - if token_ids_1 is not None: - output = output + bos_token_id + token_ids_1 + eos_token_id - - return output - - def get_special_tokens_mask( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False, - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, - token_ids_1=token_ids_1, - already_has_special_tokens=True, - ) - - bos_token_id = [1] if self.add_bos_token else [] - eos_token_id = [1] if self.add_eos_token else [] - - if token_ids_1 is None: - return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return ( - bos_token_id - + ([0] * len(token_ids_0)) - + eos_token_id - + bos_token_id - + ([0] * len(token_ids_1)) - + eos_token_id - ) - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT - sequence pair mask has the following format: - - ``` - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - ``` - - if token_ids_1 is None, only returns the first portion of the mask (0s). - - Args: - token_ids_0 (`List[int]`): - List of ids. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
- """ - bos_token_id = [self.bos_token_id] if self.add_bos_token else [] - eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - - output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) - - if token_ids_1 is not None: - output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) - - return output diff --git a/vllm/utils.py b/vllm/utils.py deleted file mode 100644 index dc817414983562c5efbe59766ed69ecf577e7487..0000000000000000000000000000000000000000 --- a/vllm/utils.py +++ /dev/null @@ -1,277 +0,0 @@ -import enum -import os -import socket -import subprocess -import uuid -from platform import uname -from typing import List, Tuple, Union -from packaging.version import parse, Version - -import psutil -import torch -import asyncio -from functools import partial -from typing import ( - Awaitable, - Callable, - TypeVar, -) -from collections import OrderedDict -from typing import Any, Hashable, Optional - -from vllm.logger import init_logger - -T = TypeVar("T") -logger = init_logger(__name__) - -STR_DTYPE_TO_TORCH_DTYPE = { - "half": torch.half, - "bfloat16": torch.bfloat16, - "float": torch.float, - "fp8_e5m2": torch.uint8, -} - - -class Device(enum.Enum): - GPU = enum.auto() - CPU = enum.auto() - - -class Counter: - - def __init__(self, start: int = 0) -> None: - self.counter = start - - def __next__(self) -> int: - i = self.counter - self.counter += 1 - return i - - def reset(self) -> None: - self.counter = 0 - - -class LRUCache: - - def __init__(self, capacity: int): - self.cache = OrderedDict() - self.capacity = capacity - - def __contains__(self, key: Hashable) -> bool: - return key in self.cache - - def __len__(self) -> int: - return len(self.cache) - - def __getitem__(self, key: Hashable) -> Any: - return self.get(key) - - def __setitem__(self, key: Hashable, value: Any) -> None: - self.put(key, value) - - def __delitem__(self, key: Hashable) -> None: - self.pop(key) - - def touch(self, key: Hashable) -> None: - self.cache.move_to_end(key) - - def get(self, key: Hashable, default_value: Optional[Any] = None) -> int: - if key in self.cache: - value = self.cache[key] - self.cache.move_to_end(key) - else: - value = default_value - return value - - def put(self, key: Hashable, value: Any) -> None: - self.cache[key] = value - self.cache.move_to_end(key) - self._remove_old_if_needed() - - def _on_remove(self, key: Hashable, value: Any): - pass - - def remove_oldest(self): - if not self.cache: - return - key, value = self.cache.popitem(last=False) - self._on_remove(key, value) - - def _remove_old_if_needed(self) -> None: - while len(self.cache) > self.capacity: - self.remove_oldest() - - def pop(self, key: int, default_value: Optional[Any] = None) -> Any: - run_on_remove = key in self.cache - value = self.cache.pop(key, default_value) - if run_on_remove: - self._on_remove(key, value) - return value - - def clear(self): - while len(self.cache) > 0: - self.remove_oldest() - self.cache.clear() - - -def is_hip() -> bool: - return torch.version.hip is not None - - -def get_max_shared_memory_bytes(gpu: int = 0) -> int: - """Returns the maximum shared memory per thread block in bytes.""" - # NOTE: This import statement should be executed lazily since - # the Neuron-X backend does not have the `cuda_utils` module. 
- from vllm._C import cuda_utils - - max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute( - gpu) - # A value of 0 would make MAX_SEQ_LEN negative and test_attention.py would fail - assert max_shared_mem > 0, "max_shared_mem cannot be zero" - return int(max_shared_mem) - - -def get_cpu_memory() -> int: - """Returns the total CPU memory of the node in bytes.""" - return psutil.virtual_memory().total - - -def random_uuid() -> str: - return str(uuid.uuid4().hex) - - -def in_wsl() -> bool: - # Reference: https://github.com/microsoft/WSL/issues/4071 - return "microsoft" in " ".join(uname()).lower() - - -def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]: - """Take a blocking function, and run it in an executor thread. - - This function prevents the blocking function from blocking the - asyncio event loop. - The code in this function needs to be thread safe. - """ - - def _async_wrapper(*args, **kwargs) -> asyncio.Future: - loop = asyncio.get_event_loop() - p_func = partial(func, *args, **kwargs) - return loop.run_in_executor(executor=None, func=p_func) - - return _async_wrapper - - -def get_ip() -> str: - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - s.connect(("8.8.8.8", 80)) # Doesn't need to be reachable - return s.getsockname()[0] - - -def get_distributed_init_method(ip: str, port: int) -> str: - return f"tcp://{ip}:{port}" - - -def get_open_port() -> int: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("", 0)) - return s.getsockname()[1] - - -def set_cuda_visible_devices(device_ids: List[int]) -> None: - os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, device_ids)) - - -def get_nvcc_cuda_version() -> Version: - cuda_home = os.environ.get('CUDA_HOME') - if not cuda_home: - cuda_home = '/usr/local/cuda' - logger.info( - f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' - ) - nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], - universal_newlines=True) - output = nvcc_output.split() - release_idx = output.index("release") + 1 - nvcc_cuda_version = parse(output[release_idx].split(",")[0]) - return nvcc_cuda_version - - -def _generate_random_fp8_e5m2( - tensor: torch.Tensor, - low: float, - high: float, -) -> None: - # NOTE(zhaoyang): Because of the NaN and Inf representations in the fp8 data type, - # directly using torch.randint to generate random fp8 data - # may produce Inf or NaN values. - # For example, s.11111.00 in the fp8e5m2 format represents Inf.
- # | E4M3 | E5M2 - #-----|-------------|------------------- - # Inf | N/A | s.11111.00 - # NaN | s.1111.111 | s.11111.{01,10,11} - from vllm._C import cache_ops - tensor_tmp = torch.empty_like(tensor, dtype=torch.float16) - tensor_tmp.uniform_(low, high) - cache_ops.convert_fp8_e5m2(tensor_tmp, tensor) - del tensor_tmp - - -def create_kv_caches_with_random( - num_blocks: int, - block_size: int, - num_layers: int, - num_heads: int, - head_size: int, - cache_dtype: Optional[Union[str, torch.dtype]], - model_dtype: Optional[Union[str, torch.dtype]] = None, - seed: Optional[int] = 0, - device: Optional[str] = "cuda", -) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - torch.random.manual_seed(seed) - torch.cuda.manual_seed(seed) - - if isinstance(cache_dtype, str): - if cache_dtype == "auto": - if isinstance(model_dtype, str): - torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype] - elif isinstance(model_dtype, torch.dtype): - torch_dtype = model_dtype - else: - raise ValueError(f"Invalid model dtype: {model_dtype}") - elif cache_dtype in ["half", "bfloat16", "float"]: - torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] - elif cache_dtype == "fp8_e5m2": - torch_dtype = torch.uint8 - else: - raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") - elif isinstance(cache_dtype, torch.dtype): - torch_dtype = cache_dtype - else: - raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") - - scale = head_size**-0.5 - x = 16 // torch.tensor([], dtype=torch_dtype).element_size() - key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) - key_caches = [] - for _ in range(num_layers): - key_cache = torch.empty(size=key_cache_shape, - dtype=torch_dtype, - device=device) - if cache_dtype in ["auto", "half", "bfloat16", "float"]: - key_cache.uniform_(-scale, scale) - elif cache_dtype == 'fp8_e5m2': - _generate_random_fp8_e5m2(key_cache, -scale, scale) - key_caches.append(key_cache) - - value_cache_shape = (num_blocks, num_heads, head_size, block_size) - value_caches = [] - for _ in range(num_layers): - value_cache = torch.empty(size=value_cache_shape, - dtype=torch_dtype, - device=device) - if cache_dtype in ["auto", "half", "bfloat16", "float"]: - value_cache.uniform_(-scale, scale) - elif cache_dtype == 'fp8_e5m2': - _generate_random_fp8_e5m2(value_cache, -scale, scale) - value_caches.append(value_cache) - return key_caches, value_caches diff --git a/vllm/worker/__init__.py b/vllm/worker/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py deleted file mode 100644 index f57e1ed75803dc0b5dfd480b01991b128006ac9a..0000000000000000000000000000000000000000 --- a/vllm/worker/cache_engine.py +++ /dev/null @@ -1,169 +0,0 @@ -"""CacheEngine class for managing the KV cache.""" -from typing import Dict, List, Tuple - -import torch - -from vllm._C import cache_ops -from vllm.config import CacheConfig, ModelConfig, ParallelConfig -from vllm.logger import init_logger -from vllm.utils import in_wsl, STR_DTYPE_TO_TORCH_DTYPE - -logger = init_logger(__name__) - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class CacheEngine: - """Manages the KV cache. - - This class is responsible for initializing and managing the GPU and CPU KV - caches. It also provides methods for performing KV cache operations, such - as swapping and copying. 
- """ - - def __init__( - self, - cache_config: CacheConfig, - model_config: ModelConfig, - parallel_config: ParallelConfig, - ) -> None: - self.cache_config = cache_config - self.model_config = model_config - self.parallel_config = parallel_config - - self.head_size = model_config.get_head_size() - self.num_layers = model_config.get_num_layers(parallel_config) - self.num_heads = model_config.get_num_kv_heads(parallel_config) - - self.block_size = cache_config.block_size - self.num_gpu_blocks = cache_config.num_gpu_blocks - self.num_cpu_blocks = cache_config.num_cpu_blocks - - if cache_config.cache_dtype == "auto": - self.dtype = model_config.dtype - else: - self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - - # Initialize the cache. - self.gpu_cache = self.allocate_gpu_cache() - self.cpu_cache = self.allocate_cpu_cache() - - # Initialize the stream for caching operations. - self.cache_stream = torch.cuda.Stream() - assert self.cache_stream != torch.cuda.current_stream() - # Initialize the events for stream synchronization. - self.events = [torch.cuda.Event() for _ in range(self.num_layers)] - - def get_key_block_shape(self) -> Tuple[int, int, int, int]: - element_size = torch.tensor([], dtype=self.dtype).element_size() - x = 16 // element_size - return ( - self.num_heads, - self.head_size // x, - self.block_size, - x, - ) - - def get_value_block_shape(self) -> Tuple[int, int, int]: - return ( - self.num_heads, - self.head_size, - self.block_size, - ) - - def allocate_gpu_cache(self) -> List[KVCache]: - gpu_cache: List[KVCache] = [] - key_block_shape = self.get_key_block_shape() - value_block_shape = self.get_value_block_shape() - for _ in range(self.num_layers): - key_blocks = torch.empty( - size=(self.num_gpu_blocks, *key_block_shape), - dtype=self.dtype, - device="cuda", - ) - value_blocks = torch.empty( - size=(self.num_gpu_blocks, *value_block_shape), - dtype=self.dtype, - device="cuda", - ) - gpu_cache.append((key_blocks, value_blocks)) - return gpu_cache - - def allocate_cpu_cache(self) -> List[KVCache]: - cpu_cache: List[KVCache] = [] - key_block_shape = self.get_key_block_shape() - value_block_shape = self.get_value_block_shape() - pin_memory = not in_wsl() - if not pin_memory: - # Pinning memory in WSL is not supported. - # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications - logger.warning("Using 'pin_memory=False' as WSL is detected. " - "This may slow down the performance.") - for _ in range(self.num_layers): - key_blocks = torch.empty( - size=(self.num_cpu_blocks, *key_block_shape), - dtype=self.dtype, - pin_memory=pin_memory, - ) - value_blocks = torch.empty( - size=(self.num_cpu_blocks, *value_block_shape), - dtype=self.dtype, - pin_memory=pin_memory, - ) - cpu_cache.append((key_blocks, value_blocks)) - return cpu_cache - - def _swap( - self, - src: List[KVCache], - dst: List[KVCache], - src_to_dst: Dict[int, int], - ) -> None: - with torch.cuda.stream(self.cache_stream): - for i in range(self.num_layers): - src_key_cache, src_value_cache = src[i] - dst_key_cache, dst_value_cache = dst[i] - # Copy the key blocks. - cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) - # Copy the value blocks. 
- cache_ops.swap_blocks(src_value_cache, dst_value_cache, - src_to_dst) - event = self.events[i] - event.record(stream=self.cache_stream) - - def swap_in(self, src_to_dst: Dict[int, int]) -> None: - self._swap(self.cpu_cache, self.gpu_cache, src_to_dst) - - def swap_out(self, src_to_dst: Dict[int, int]) -> None: - self._swap(self.gpu_cache, self.cpu_cache, src_to_dst) - - def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: - key_caches = [key_cache for key_cache, _ in self.gpu_cache] - value_caches = [value_cache for _, value_cache in self.gpu_cache] - # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU. - cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) - - @staticmethod - def get_cache_block_size( - block_size: int, - cache_dtype: str, - model_config: ModelConfig, - parallel_config: ParallelConfig, - ) -> int: - head_size = model_config.get_head_size() - num_heads = model_config.get_num_kv_heads(parallel_config) - num_layers = model_config.get_num_layers(parallel_config) - - key_cache_block = block_size * num_heads * head_size - value_cache_block = key_cache_block - total = num_layers * (key_cache_block + value_cache_block) - if cache_dtype == "auto": - dtype = model_config.dtype - else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] - dtype_size = _get_dtype_size(dtype) - return dtype_size * total - - -def _get_dtype_size(dtype: torch.dtype) -> int: - return torch.tensor([], dtype=dtype).element_size() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py deleted file mode 100644 index 2a12152a708634fecdafa08f9a0ef71d781d8c81..0000000000000000000000000000000000000000 --- a/vllm/worker/model_runner.py +++ /dev/null @@ -1,813 +0,0 @@ -import time -from typing import Dict, List, Optional, Tuple, Set, Union - -import numpy as np -import torch -import torch.nn as nn - -from vllm.config import ModelConfig, LoRAConfig, ParallelConfig, SchedulerConfig -from vllm.logger import init_logger -from vllm.model_executor import get_model, InputMetadata, SamplingMetadata -from vllm.model_executor.parallel_utils.communication_op import ( - broadcast_tensor_dict) -from vllm.model_executor.parallel_utils import custom_all_reduce -from vllm.sampling_params import SamplingParams, SamplingType -from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata -from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.utils import in_wsl - -logger = init_logger(__name__) - -KVCache = Tuple[torch.Tensor, torch.Tensor] -_PAD_SLOT_ID = -1 -LORA_WARMUP_RANK = 8 -# Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256. -# NOTE: _get_graph_batch_size needs to be updated if this list is changed. -_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)] - - -class ModelRunner: - - def __init__( - self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - lora_config: Optional[LoRAConfig], - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.lora_config = lora_config - self.is_driver_worker = is_driver_worker - - # model_config can be None in tests/samplers/test_sampler.py. - # FIXME(woosuk): This is a hack to make the tests work. Refactor this. 
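To make the `CacheEngine.get_cache_block_size` formula above concrete, here is a back-of-the-envelope calculation; all numbers are illustrative assumptions rather than values from a particular model.

```python
# Worked example of the cache-block-size formula above (key block + value
# block per layer, times the number of layers, times the element size).
block_size = 16      # tokens per KV-cache block (assumed)
num_kv_heads = 32    # KV heads per worker (assumed)
head_size = 128      # assumed
num_layers = 32      # assumed
dtype_size = 2       # bytes per element for fp16

key_cache_block = block_size * num_kv_heads * head_size   # 65,536 elements
value_cache_block = key_cache_block                       # same shape
total = num_layers * (key_cache_block + value_cache_block)
print(total * dtype_size)  # 8,388,608 bytes, i.e. 8 MiB per cache block
```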
- self.sliding_window = (model_config.get_sliding_window() - if model_config is not None else None) - self.device = torch.device(torch.cuda.current_device()) - self.model = None - self.block_size = None # Set after initial profiling. - self.lora_manager = None - - self.graph_runners: Dict[int, CUDAGraphRunner] = {} - self.graph_memory_pool = None # Set during graph capture. - - self.max_context_len_to_capture = ( - self.model_config.max_context_len_to_capture - if self.model_config is not None else 0) - # When using CUDA graph, the input block tables must be padded to - # max_context_len_to_capture. However, creating the block table in - # Python can be expensive. To optimize this, we cache the block table - # in numpy and only copy the actual input content at every iteration. - # The shape of the cached block table will be - # (max batch size to capture, max context len to capture / block size). - self.graph_block_tables = None # Set after initial profiling. - # cache in_wsl result - self.in_wsl = in_wsl() - self.kv_cache_dtype = kv_cache_dtype - - def load_model(self) -> None: - self.model = get_model(self.model_config, self.lora_config) - - vocab_size = self.model.config.vocab_size - - if self.lora_config: - self.lora_manager = LRUCacheWorkerLoRAManager( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens + - self.scheduler_config.max_paddings, vocab_size, - self.lora_config, self.device) - self.model = self.lora_manager.create_lora_manager(self.model) - - def set_block_size(self, block_size: int) -> None: - self.block_size = block_size - - max_num_blocks = (self.max_context_len_to_capture + block_size - - 1) // block_size - self.graph_block_tables = np.zeros( - (max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32) - - def _prepare_prompt( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, List[int], List[int], - List[int], List[int], Set[LoRARequest]]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - slot_mapping: List[List[int]] = [] - lora_index_mapping: List[int] = [] - lora_prompt_mapping: List[int] = [] - lora_requests: Set[LoRARequest] = set() - - prompt_lens: List[int] = [] - context_lens: List[int] = [] - subquery_lens: List[int] = [] - prefix_block_tables: List[List[int]] = [] - for seq_group_metadata in seq_group_metadata_list: - assert seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - - seq_data = seq_group_metadata.seq_data[seq_id] - prompt_tokens = seq_data.get_token_ids() - prompt_len = len(prompt_tokens) - prompt_lens.append(prompt_len) - prefix_len = 0 - prefix = seq_group_metadata.prefix - if prefix is not None and prefix.computed: - prefix_len = prefix.get_length() - prompt_tokens = prompt_tokens[prefix_len:] - prefix_block_tables.append(prefix.get_block_numbers()) - else: - prefix_block_tables.append([]) - # actual prompt lens - context_lens.append(prefix_len) - subquery_lens.append(prompt_len - prefix_len) - - input_tokens.append(prompt_tokens) - # NOTE(woosuk): Here we assume that the first token in the prompt - # is always the first token in the sequence. 
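`set_block_size` above sizes the cached NumPy block table with a ceiling division so that even a maximal-length context fits. A rough sketch of the table's footprint under assumed capture settings (the concrete numbers are illustrative, not defaults from this code):

```python
import numpy as np

max_context_len_to_capture = 8192     # assumption
block_size = 16                       # assumption
max_batch_to_capture = 256            # max(_BATCH_SIZES_TO_CAPTURE)

max_num_blocks = (max_context_len_to_capture + block_size - 1) // block_size   # ceil -> 512
graph_block_tables = np.zeros((max_batch_to_capture, max_num_blocks), dtype=np.int32)
print(graph_block_tables.shape, graph_block_tables.nbytes // 1024, "KiB")       # (256, 512) 512 KiB
```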
- input_positions.append( - list(range(prefix_len, prefix_len + len(prompt_tokens)))) - - lora_id = seq_group_metadata.lora_int_id - - if lora_id > 0: - lora_requests.add(seq_group_metadata.lora_request) - - lora_index_mapping.append([lora_id] * prompt_len) - lora_prompt_mapping.extend( - [lora_id] * - (prompt_len - if seq_group_metadata.sampling_params.prompt_logprobs else 1)) - - if seq_group_metadata.block_tables is None: - # During memory profiling, the block tables are not initialized - # yet. In this case, we just use a dummy slot mapping. - slot_mapping.append([_PAD_SLOT_ID] * prompt_len) - continue - - # Compute the slot mapping. - slot_mapping.append([]) - block_table = seq_group_metadata.block_tables[seq_id] - # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, - # where start_idx is max(0, prompt_len - sliding_window). - # For example, if the prompt len is 10, sliding window is 8, and - # block size is 4, the first two tokens are masked and the slot - # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. - start_idx = 0 - if self.sliding_window is not None: - assert prefix_len == 0, ( - "Prefix caching is currently not supported with " - "sliding window attention") - start_idx = max(0, prompt_len - self.sliding_window) - for i in range(prefix_len, prompt_len): - if i < start_idx: - slot_mapping[-1].append(_PAD_SLOT_ID) - continue - - block_number = block_table[i // self.block_size] - block_offset = i % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping[-1].append(slot) - - max_prompt_len = max(subquery_lens) - input_tokens = _make_tensor_with_pad(input_tokens, - max_prompt_len, - pad=0, - dtype=torch.long) - input_positions = _make_tensor_with_pad(input_positions, - max_prompt_len, - pad=0, - dtype=torch.long) - slot_mapping = _make_tensor_with_pad(slot_mapping, - max_prompt_len, - pad=_PAD_SLOT_ID, - dtype=torch.long) - lora_index_mapping = [ - _pad_to_max(mapping, max_prompt_len, pad=0) - for mapping in lora_index_mapping - ] - context_lens_tensor = torch.tensor(context_lens, - dtype=torch.int, - device='cuda') - # Prepare prefix block tables - max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) - block_tables = _make_tensor_with_pad( - prefix_block_tables, - max_len=max_prompt_block_table_len, - pad=0, - dtype=torch.int, - ) - start_loc_tensor = torch.arange(0, - len(prompt_lens) * max_prompt_len, - max_prompt_len, - dtype=torch.long, - device='cuda') - prompt_lens_tensor = torch.tensor(prompt_lens, - dtype=torch.long, - device='cuda') - - input_metadata = InputMetadata( - is_prompt=True, - slot_mapping=slot_mapping, - prompt_lens=prompt_lens_tensor, - max_seq_len=max_prompt_len, - start_loc=start_loc_tensor, - max_context_len=None, - context_lens=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=False, - kv_cache_dtype=self.kv_cache_dtype, - ) - return (input_tokens, input_positions, input_metadata, prompt_lens, - subquery_lens, lora_index_mapping, lora_prompt_mapping, - lora_requests) - - def _prepare_decode( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, List[int], List[int], - Set[LoRARequest]]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - slot_mapping: List[List[int]] = [] - context_lens: List[int] = [] - block_tables: List[List[int]] = [] - lora_index_mapping: List[int] = [] - lora_prompt_mapping: List[int] = [] - lora_requests: 
Set[LoRARequest] = set() - - for seq_group_metadata in seq_group_metadata_list: - assert not seq_group_metadata.is_prompt - - seq_ids = list(seq_group_metadata.seq_data.keys()) - lora_id = seq_group_metadata.lora_int_id - - if lora_id > 0: - lora_requests.add(seq_group_metadata.lora_request) - - for seq_id in seq_ids: - seq_data = seq_group_metadata.seq_data[seq_id] - generation_token = seq_data.get_last_token_id() - input_tokens.append([generation_token]) - - seq_len = seq_data.get_len() - position = seq_len - 1 - input_positions.append([position]) - - context_len = seq_len if self.sliding_window is None else min( - seq_len, self.sliding_window) - context_lens.append(context_len) - - block_table = seq_group_metadata.block_tables[seq_id] - block_number = block_table[position // self.block_size] - block_offset = position % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping.append([slot]) - lora_index_mapping.append([lora_id]) - lora_prompt_mapping.append(lora_id) - - if self.sliding_window is not None: - sliding_window_blocks = (self.sliding_window // - self.block_size) - block_table = block_table[-sliding_window_blocks:] - block_tables.append(block_table) - - batch_size = len(input_tokens) - max_context_len = max(context_lens) - use_captured_graph = ( - not self.model_config.enforce_eager - and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] - and max_context_len <= self.max_context_len_to_capture) - if use_captured_graph: - # Pad the input tokens, positions, and slot mapping to match the - # batch size of the captured graph. - graph_batch_size = _get_graph_batch_size(batch_size) - assert graph_batch_size >= batch_size - for _ in range(graph_batch_size - batch_size): - input_tokens.append([]) - input_positions.append([]) - slot_mapping.append([]) - context_lens.append(1) - block_tables.append([]) - batch_size = graph_batch_size - - input_tokens = _make_tensor_with_pad(input_tokens, - max_len=1, - pad=0, - dtype=torch.long, - device="cuda") - input_positions = _make_tensor_with_pad(input_positions, - max_len=1, - pad=0, - dtype=torch.long, - device="cuda") - slot_mapping = _make_tensor_with_pad(slot_mapping, - max_len=1, - pad=_PAD_SLOT_ID, - dtype=torch.long, - device="cuda") - context_lens = torch.tensor(context_lens, - dtype=torch.int, - device="cuda") - - if use_captured_graph: - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. 
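Both the prompt path and the decode path above resolve a token position to a physical cache slot as `block_number * block_size + block_offset`. The sketch below replays the sliding-window example documented in `_prepare_prompt` (prompt length 10, window 8, block size 4), assuming a hypothetical block table `[0, 1, 0]` in which the first block has been recycled once the window slid past its tokens:

```python
# Reproduces the documented mapping [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
_PAD_SLOT_ID = -1
block_size = 4
sliding_window = 8
prompt_len = 10
block_table = [0, 1, 0]   # hypothetical allocation; block 0 is reused once the window moves on

start_idx = max(0, prompt_len - sliding_window)   # the first two tokens fall outside the window
slot_mapping = []
for i in range(prompt_len):
    if i < start_idx:
        slot_mapping.append(_PAD_SLOT_ID)
        continue
    block_number = block_table[i // block_size]
    slot_mapping.append(block_number * block_size + i % block_size)

assert slot_mapping == [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]
```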
- input_block_tables = self.graph_block_tables[:batch_size] - for i, block_table in enumerate(block_tables): - if block_table: - input_block_tables[i, :len(block_table)] = block_table - block_tables = torch.tensor(input_block_tables, device="cuda") - else: - max_block_table_len = max( - len(block_table) for block_table in block_tables) - block_tables = _make_tensor_with_pad( - block_tables, - max_len=max_block_table_len, - pad=0, - dtype=torch.int, - device="cuda", - ) - - lora_index_mapping = [ - _pad_to_max(mapping, 1, pad=0) for mapping in lora_index_mapping - ] - - input_metadata = InputMetadata( - is_prompt=False, - slot_mapping=slot_mapping, - prompt_lens=None, - max_seq_len=None, - start_loc=None, - max_context_len=max_context_len, - context_lens=context_lens, - block_tables=block_tables, - use_cuda_graph=use_captured_graph, - kv_cache_dtype=self.kv_cache_dtype, - ) - return input_tokens, input_positions, input_metadata, lora_index_mapping, lora_prompt_mapping, lora_requests - - def _prepare_sample( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - prompt_lens: List[int], - subquery_lens: Optional[List[int]], - ) -> SamplingMetadata: - seq_groups: List[Tuple[List[int], SamplingParams]] = [] - selected_token_indices: List[int] = [] - selected_token_start_idx = 0 - categorized_sample_indices = {t: [] for t in SamplingType} - categorized_sample_indices_start_idx = 0 - - max_subquery_len = max(subquery_lens) if subquery_lens else 1 - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = list(seq_group_metadata.seq_data.keys()) - sampling_params = seq_group_metadata.sampling_params - seq_groups.append((seq_ids, sampling_params)) - - if seq_group_metadata.is_prompt: - assert len(seq_ids) == 1 - assert subquery_lens is not None - subquery_len = subquery_lens[i] - if sampling_params.prompt_logprobs is not None: - # NOTE: prompt token positions do not need sample, skip - categorized_sample_indices_start_idx += subquery_len - 1 - - categorized_sample_indices[ - sampling_params.sampling_type].append( - categorized_sample_indices_start_idx) - categorized_sample_indices_start_idx += 1 - - if sampling_params.prompt_logprobs is not None: - selected_token_indices.extend( - range(selected_token_start_idx, - selected_token_start_idx + subquery_len - 1)) - selected_token_indices.append(selected_token_start_idx + - subquery_len - 1) - selected_token_start_idx += max_subquery_len - else: - num_seqs = len(seq_ids) - selected_token_indices.extend( - range(selected_token_start_idx, - selected_token_start_idx + num_seqs)) - selected_token_start_idx += num_seqs - - categorized_sample_indices[ - sampling_params.sampling_type].extend( - range(categorized_sample_indices_start_idx, - categorized_sample_indices_start_idx + num_seqs)) - categorized_sample_indices_start_idx += num_seqs - - selected_token_indices = _async_h2d(selected_token_indices, - dtype=torch.long, - pin_memory=not self.in_wsl) - categorized_sample_indices = { - t: _async_h2d(seq_ids, dtype=torch.int, pin_memory=not self.in_wsl) - for t, seq_ids in categorized_sample_indices.items() - } - - seq_data: Dict[int, SequenceData] = {} - for seq_group_metadata in seq_group_metadata_list: - seq_data.update(seq_group_metadata.seq_data) - - sampling_metadata = SamplingMetadata( - seq_groups=seq_groups, - seq_data=seq_data, - prompt_lens=prompt_lens, - selected_token_indices=selected_token_indices, - categorized_sample_indices=categorized_sample_indices, - ) - return sampling_metadata - - def prepare_input_tensors( - 
self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, SamplingMetadata, - Set[int], LoRAMapping]: - if self.is_driver_worker: - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, input_metadata, prompt_lens, - subquery_lens, lora_index_mapping, lora_prompt_mapping, - lora_requests) = self._prepare_prompt(seq_group_metadata_list) - else: - (input_tokens, input_positions, input_metadata, - lora_index_mapping, lora_prompt_mapping, - lora_requests) = self._prepare_decode(seq_group_metadata_list) - prompt_lens = [] - subquery_lens = None - sampling_metadata = self._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens) - - if self.lora_config: - flat_lora_index_mapping = [ - item for sublist in lora_index_mapping for item in sublist - ] - lora_mapping = LoRAMapping( - flat_lora_index_mapping, - lora_prompt_mapping, - ) - else: - lora_mapping = None - - # Broadcast the metadata. - metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "is_prompt": input_metadata.is_prompt, - "slot_mapping": input_metadata.slot_mapping, - "prompt_lens": input_metadata.prompt_lens, - "max_seq_len": input_metadata.max_seq_len, - "start_loc": input_metadata.start_loc, - "max_context_len": input_metadata.max_context_len, - "context_lens": input_metadata.context_lens, - "block_tables": input_metadata.block_tables, - "use_cuda_graph": input_metadata.use_cuda_graph, - "kv_cache_dtype": input_metadata.kv_cache_dtype, - "selected_token_indices": - sampling_metadata.selected_token_indices, - "lora_requests": lora_requests, - "lora_mapping": lora_mapping, - } - broadcast_tensor_dict(metadata_dict, src=0) - else: - metadata_dict = broadcast_tensor_dict(src=0) - input_tokens = metadata_dict["input_tokens"] - input_positions = metadata_dict["input_positions"] - lora_mapping = metadata_dict["lora_mapping"] - lora_requests = metadata_dict["lora_requests"] - input_metadata = InputMetadata( - is_prompt=metadata_dict["is_prompt"], - slot_mapping=metadata_dict["slot_mapping"], - prompt_lens=metadata_dict["prompt_lens"], - max_seq_len=metadata_dict["max_seq_len"], - start_loc=metadata_dict["start_loc"], - max_context_len=metadata_dict["max_context_len"], - context_lens=metadata_dict["context_lens"], - block_tables=metadata_dict["block_tables"], - use_cuda_graph=metadata_dict["use_cuda_graph"], - kv_cache_dtype=metadata_dict["kv_cache_dtype"], - ) - sampling_metadata = SamplingMetadata( - seq_groups=None, - seq_data=None, - prompt_lens=None, - selected_token_indices=metadata_dict["selected_token_indices"], - categorized_sample_indices=None, - perform_sampling=False, - ) - - return input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping - - @torch.inference_mode() - def execute_model( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - ) -> Optional[SamplerOutput]: - input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping = ( - self.prepare_input_tensors(seq_group_metadata_list)) - - if self.lora_config: - self.set_active_loras(lora_requests, lora_mapping) - - # Execute the model. 
- if input_metadata.use_cuda_graph: - graph_batch_size = input_tokens.shape[0] - model_executable = self.graph_runners[graph_batch_size] - else: - model_executable = self.model - hidden_states = model_executable( - input_ids=input_tokens, - positions=input_positions, - kv_caches=kv_caches, - input_metadata=input_metadata, - ) - - # Sample the next token. - output = self.model.sample( - hidden_states=hidden_states, - sampling_metadata=sampling_metadata, - ) - return output - - @torch.inference_mode() - def profile_run(self) -> None: - # Enable top-k sampling to reflect the accurate memory usage. - vocab_size = self.model_config.get_vocab_size() - sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1) - max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - - # This represents the maximum number of different requests - # that will have unique loras, and therefore the max amount of memory - # consumption. Create dummy lora request copies from the lora request - # passed in, which contains a lora from the lora warmup path. - dummy_lora_requests = [] - dummy_lora_requests_per_seq = [] - if self.lora_config: - for idx in range(self.lora_config.max_loras): - lora_id = idx + 1 - dummy_lora_request = LoRARequest( - lora_name=f"warmup_{lora_id}", - lora_int_id=lora_id, - lora_local_path="/not/a/real/path", - ) - self.lora_manager.add_dummy_lora(dummy_lora_request, - rank=LORA_WARMUP_RANK) - dummy_lora_requests.append(dummy_lora_request) - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] - - # Profile memory usage with max_num_sequences sequences and the total - # number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - seq_data = SequenceData([0] * seq_len) - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=None, - lora_request=dummy_lora_requests_per_seq[group_id] - if dummy_lora_requests_per_seq else None, - ) - seqs.append(seq) - - # Run the model with the dummy inputs.
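`profile_run` above spreads `max_num_batched_tokens` as evenly as possible across `max_num_seqs` dummy sequences, with the first `max_num_batched_tokens % max_num_seqs` groups taking one extra token. A quick numeric check with assumed scheduler limits:

```python
# Assumed limits, for illustration only.
max_num_batched_tokens = 8192
max_num_seqs = 5

seq_lens = [
    max_num_batched_tokens // max_num_seqs
    + (group_id < max_num_batched_tokens % max_num_seqs)   # bool adds 1 for the first two groups
    for group_id in range(max_num_seqs)
]
assert seq_lens == [1639, 1639, 1638, 1638, 1638]
assert sum(seq_lens) == max_num_batched_tokens
```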
- num_layers = self.model_config.get_num_layers(self.parallel_config) - kv_caches = [(None, None)] * num_layers - self.execute_model(seqs, kv_caches) - torch.cuda.synchronize() - return - - def remove_all_loras(self) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_all_loras() - - def set_active_loras(self, lora_requests: List[LoRARequest], - lora_mapping: LoRAMapping) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.set_active_loras(lora_requests, lora_mapping) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_lora(lora_id) - - def list_loras(self) -> Set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.list_loras() - - @torch.inference_mode() - def capture_model(self, kv_caches: List[KVCache]) -> None: - assert not self.model_config.enforce_eager - logger.info("Capturing the model for CUDA graphs. This may lead to " - "unexpected consequences if the model is not static. To " - "run the model in eager mode, set 'enforce_eager=True' or " - "use '--enforce-eager' in the CLI.") - logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. " - "If you are running out of memory, consider decreasing " - "`gpu_memory_utilization` or enforcing eager mode. " - "You can also reduce the `max_num_seqs` as needed " - "to decrease memory usage.") - start_time = time.perf_counter() - - # Prepare dummy inputs. These will be reused for all batch sizes. - max_batch_size = max(_BATCH_SIZES_TO_CAPTURE) - input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda() - input_positions = torch.zeros(max_batch_size, 1, - dtype=torch.long).cuda() - slot_mapping = torch.empty(max_batch_size, 1, dtype=torch.long).cuda() - slot_mapping.fill_(_PAD_SLOT_ID) - context_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda() - block_tables = torch.from_numpy(self.graph_block_tables).cuda() - - graph_batch_size = _get_graph_batch_size( - self.scheduler_config.max_num_seqs) - batch_size_capture_list = [ - bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size - ] - - # NOTE: Capturing the largest batch size first may help reduce the - # memory usage of CUDA graph. - with custom_all_reduce.capture(): - for batch_size in reversed(batch_size_capture_list): - # Create dummy input_metadata. 
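`capture_model` above only captures graphs for batch sizes the scheduler can actually produce: every entry of `_BATCH_SIZES_TO_CAPTURE` up to the padded `max_num_seqs`, walked largest-first, which the code notes may reduce CUDA-graph memory usage since all graphs share one memory pool. A sketch with an assumed `max_num_seqs`:

```python
# Assumed value, for illustration.
max_num_seqs = 48
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]

graph_batch_size = (max_num_seqs + 7) // 8 * 8   # what _get_graph_batch_size returns for 48
batch_size_capture_list = [bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size]

assert batch_size_capture_list == [1, 2, 4, 8, 16, 24, 32, 40, 48]
# The capture loop then iterates reversed(batch_size_capture_list), so the 48-wide graph comes first.
```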
- input_metadata = InputMetadata( - is_prompt=False, - slot_mapping=slot_mapping[:batch_size], - prompt_lens=None, - max_seq_len=None, - start_loc=None, - max_context_len=self.max_context_len_to_capture, - context_lens=context_lens[:batch_size], - block_tables=block_tables[:batch_size], - use_cuda_graph=True, - kv_cache_dtype=self.kv_cache_dtype, - ) - - if self.lora_config: - lora_mapping = LoRAMapping( - [0] * batch_size, - [0] * batch_size, - ) - self.set_active_loras(set(), lora_mapping) - - graph_runner = CUDAGraphRunner(self.model) - graph_runner.capture( - input_tokens[:batch_size], - input_positions[:batch_size], - kv_caches, - input_metadata, - memory_pool=self.graph_memory_pool, - ) - self.graph_memory_pool = graph_runner.graph.pool() - self.graph_runners[batch_size] = graph_runner - - end_time = time.perf_counter() - elapsed_time = end_time - start_time - # This usually takes < 10 seconds. - logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.") - - -class CUDAGraphRunner: - - def __init__(self, model: nn.Module): - self.model = model - self.graph = None - self.input_buffers: Dict[str, torch.Tensor] = {} - self.output_buffers: Dict[str, torch.Tensor] = {} - - def capture( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - memory_pool, - ) -> None: - assert self.graph is None - # Run the model once without capturing the graph. - # This is to make sure that the captured graph does not include the - # kernel launches for initial benchmarking (e.g., Triton autotune). - self.model( - input_ids, - positions, - kv_caches, - input_metadata, - ) - torch.cuda.synchronize() - - # Capture the graph. - self.graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(self.graph, pool=memory_pool): - hidden_states = self.model( - input_ids, - positions, - kv_caches, - input_metadata, - ) - torch.cuda.synchronize() - - # Save the input and output buffers. - self.input_buffers = { - "input_ids": input_ids, - "positions": positions, - "kv_caches": kv_caches, - "slot_mapping": input_metadata.slot_mapping, - "context_lens": input_metadata.context_lens, - "block_tables": input_metadata.block_tables, - } - self.output_buffers = {"hidden_states": hidden_states} - return - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - input_metadata: InputMetadata, - ) -> torch.Tensor: - # KV caches are fixed tensors, so we don't need to copy them. - del kv_caches - - # Copy the input tensors to the input buffers. - self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) - self.input_buffers["positions"].copy_(positions, non_blocking=True) - self.input_buffers["slot_mapping"].copy_(input_metadata.slot_mapping, - non_blocking=True) - self.input_buffers["context_lens"].copy_(input_metadata.context_lens, - non_blocking=True) - self.input_buffers["block_tables"].copy_(input_metadata.block_tables, - non_blocking=True) - - # Run the graph. - self.graph.replay() - - # Return the output tensor. 
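`CUDAGraphRunner` follows the usual PyTorch capture-and-replay idiom: run the model once to flush one-time work, capture a replayable graph whose input and output tensors are fixed buffers, then on each call copy fresh inputs into those buffers and replay. A minimal, self-contained illustration of the same idiom with a toy module (this is generic PyTorch, not vLLM API, and it needs a CUDA device):

```python
import torch
import torch.nn as nn

# Toy stand-in for the real model.
model = nn.Linear(64, 64).cuda().eval()
static_input = torch.zeros(8, 64, device="cuda")

with torch.inference_mode():
    # Warm-up run so one-time kernel selection does not get recorded.
    model(static_input)
    torch.cuda.synchronize()

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_output = model(static_input)   # input and output buffers stay fixed after capture

def run(x: torch.Tensor) -> torch.Tensor:
    static_input.copy_(x, non_blocking=True)  # refresh the captured input buffer in place
    graph.replay()                            # re-launch the recorded kernels
    return static_output                      # same tensor object, new contents

out = run(torch.randn(8, 64, device="cuda"))
```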
- return self.output_buffers["hidden_states"] - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - - -def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]: - assert len(x) <= max_len - return x + [pad] * (max_len - len(x)) - - -def _make_tensor_with_pad( - x: List[List[int]], - max_len: int, - pad: int, - dtype: torch.dtype, - device: Union[str, torch.device] = "cuda", - pin_memory: bool = False, -) -> torch.Tensor: - padded_x = [_pad_to_max(x_i, max_len, pad) for x_i in x] - return torch.tensor(padded_x, - dtype=dtype, - device=device, - pin_memory=pin_memory and str(device) == "cpu") - - -def _get_graph_batch_size(batch_size: int) -> int: - if batch_size <= 2: - return batch_size - elif batch_size <= 4: - return 4 - else: - return (batch_size + 7) // 8 * 8 - - -def _async_h2d(data: list, dtype, pin_memory): - t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory) - return t.to(device="cuda", non_blocking=True) diff --git a/vllm/worker/spec_decode/multi_step_worker.py b/vllm/worker/spec_decode/multi_step_worker.py deleted file mode 100644 index 591d1b1300c88856f31e98c2db10c1d43c9fb36d..0000000000000000000000000000000000000000 --- a/vllm/worker/spec_decode/multi_step_worker.py +++ /dev/null @@ -1,178 +0,0 @@ -from typing import List, Dict -import copy - -import torch - -from vllm.sequence import SamplerOutput, SequenceGroupMetadata -from vllm.worker.worker import Worker - - -class MultiStepWorker(Worker): - """The MultiStepWorker is equivalent to a Worker except that it allows - multiple forward passes in a single call, assuming the scheduler has - allocated enough space to store the additional KV. This reduces overhead - by invoking the scheduler less. - - The MultiStepWorker does not support cache swap operations, or beam search. - Cache swap operations do not require large modifications. On the other hand, - beam search requires memory allocations during sequence forks and thus - requires more thought for MultiStepWorker support. - """ - - @torch.inference_mode() - def execute_model_multi_step( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - num_steps: int, - ) -> List[SamplerOutput]: - """Run the model forward pass num_steps times. Returns the list of - sampler output, one per model forward pass. - """ - self._raise_if_unsupported(seq_group_metadata_list, blocks_to_swap_in, - blocks_to_swap_out, blocks_to_copy) - - # Shallow copy input data so modifications (such as appending tokens) - # do not cause side-effects. - copied_seq_group_metadata_list = self._shallow_copy_inputs( - seq_group_metadata_list) - - # Assert enough KV space for num_steps tokens per sequence. - self._assert_enough_kv_space(seq_group_metadata_list, num_steps) - - # Run model num_steps times. - model_outputs = [] - for _ in range(num_steps): - model_output = super().execute_model( - seq_group_metadata_list=copied_seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ) - - self._append_new_tokens(model_output, - copied_seq_group_metadata_list) - model_outputs.append(model_output) - - return model_outputs - - def _append_new_tokens( - self, model_output: SamplerOutput, - seq_group_metadata_list: SequenceGroupMetadata) -> None: - """Given model output from a single run, append the tokens to the - sequences. 
This is normally done outside of the worker, but it is - required if the worker is to perform multiple forward passes. - """ - for seq_group_metadata, sequence_group_outputs in zip( - seq_group_metadata_list, model_output): - seq_group_metadata.is_prompt = False - - for seq_output in sequence_group_outputs.samples: - # NOTE: Beam search is not supported, so we can assume that - # parent_seq_id == seq_id. - seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] - - token_id = seq_output.output_token - token_logprob = seq_output.logprobs[token_id] - - seq.append_token_id(token_id, token_logprob) - - def _shallow_copy_inputs( - self, seq_group_metadata_list: List[SequenceGroupMetadata] - ) -> List[SequenceGroupMetadata]: - """Copy input data structures to remove side-effects when input data - structures are shared with other modules. - - The multi-step worker must be able to append tokens to sequences after - a forward pass. This necessitates modification of the data structures - used by the worker. Since these data structures are shared with other - parts of vLLM, like the scheduler, we must take care not to introduce - unexpected side-effects. - - When Ray is used to orchestrate worker processes (such as when the - tensor-parallel degree is >1), this is not a problem because the input - datastructures will be serialized and created anew in the worker - process. - - However, when Ray is not used to orchestrate the worker processes (such - as when the tensor-parallel degree is 1), this is a problem. We avoid - the problem by shallow-copying the input datastructures (specifically, - the parts that will change in multiple steps). - """ - - # Shallow-copy the list of SequenceGroupMetadata. This allows us to - # append tokens and change is_prompt without external side-effects. - new_seq_group_metadata_list = [] - - for old_seq_group_metadata in seq_group_metadata_list: - # We must shallow-copy seq_group_metadata as is_prompt could change. - seq_group_metadata = copy.copy(old_seq_group_metadata) - new_seq_group_metadata_list.append(seq_group_metadata) - - # We must shallow-copy seq_data as we will append token ids - new_seq_data = {} - for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): - new_seq_data[seq_id] = copy.copy(old_seq_data) - new_seq_data[ - seq_id].output_token_ids = old_seq_data.output_token_ids[:] - - seq_group_metadata.seq_data = new_seq_data - - return new_seq_group_metadata_list - - def _assert_enough_kv_space( - self, seq_group_metadata_list: List[SequenceGroupMetadata], - num_steps: int) -> None: - """Assert there are enough physical blocks per sequence to store the - current KV plus additional KV from num_steps tokens. - """ - assert self.model_runner.block_size is not None - for seq_group_metadata in seq_group_metadata_list: - # Only one seq_id is guaranteed because there is no beam search. - seq_id = list(seq_group_metadata.seq_data.keys())[0] - seq = seq_group_metadata.seq_data[seq_id] - - # After num_steps, the seq len will be the current seq len - # plus one token per step. - final_seq_len = seq.get_len() + num_steps - - # We will have final_seq_len - 1 KV because vLLM saves KV for a - # token in the iteration after the token was generated. - required_num_kv_slots = final_seq_len - 1 - - # The allocated number of kv slots is the number of allocated blocks - # times the number of slots of block. 
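The KV-space check in `_assert_enough_kv_space` is plain arithmetic: after `num_steps` extra tokens a sequence will need `final_seq_len - 1` slots (the newest token's KV is written on the following iteration), against `allocated_blocks * block_size` available. A worked example with assumed numbers where the check would fail:

```python
# Assumed numbers, for illustration.
block_size = 16
current_seq_len = 30
allocated_blocks = 2
num_steps = 4

allocated_kv_slots = allocated_blocks * block_size            # 32
required_num_kv_slots = (current_seq_len + num_steps) - 1     # 33
assert required_num_kv_slots > allocated_kv_slots             # MultiStepWorker would raise ValueError here
```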
- number_physical_blocks = len( - seq_group_metadata.block_tables[seq_id]) - allocated_kv_slots = (number_physical_blocks * - self.model_runner.block_size) - - if required_num_kv_slots > allocated_kv_slots: - request_id = seq_group_metadata.request_id - raise ValueError( - "The worker attempted to run " - f"{num_steps} times but found insufficient KV space for " - f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " - f"{required_num_kv_slots=}).") - - def _raise_if_unsupported( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ) -> None: - """MultiStepWorker does not yet implement support for cache swap - operations or beam search. - """ - if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]): - raise NotImplementedError( - "MultiStepWorker does not support cache operations") - - if any( - len(seq_group_metadata.seq_data.keys()) != 1 - for seq_group_metadata in seq_group_metadata_list): - raise NotImplementedError( - "MultiStepWorker does not support beam search.") diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py deleted file mode 100644 index a74adfa585611b8cbaa645513e9a1e6232d3ae76..0000000000000000000000000000000000000000 --- a/vllm/worker/worker.py +++ /dev/null @@ -1,269 +0,0 @@ -"""A GPU worker class.""" -import gc -import os -from typing import Dict, List, Tuple, Set, Optional - -import torch -import torch.distributed - -from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, - SchedulerConfig, LoRAConfig) -from vllm.model_executor import set_random_seed -from vllm.model_executor.parallel_utils.communication_op import ( - broadcast_tensor_dict) -from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar -from vllm.model_executor.parallel_utils.parallel_state import ( - ensure_model_parallel_initialized) -from vllm.sequence import SamplerOutput, SequenceGroupMetadata -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.model_runner import ModelRunner -from vllm.lora.request import LoRARequest - - -class Worker: - """A worker class that executes (a partition of) the model on a GPU. - - Each worker is associated with a single GPU. The worker is responsible for - maintaining the KV cache and executing the model on the GPU. In case of - distributed inference, each worker is assigned a partition of the model. - """ - - def __init__( - self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - lora_config: Optional[LoRAConfig] = None, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - ) -> None: - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - self.is_driver_worker = is_driver_worker - if self.is_driver_worker: - assert self.rank == 0, "The driver worker must have rank 0." - - self.model_runner = ModelRunner(model_config, - parallel_config, - scheduler_config, - lora_config=self.lora_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker) - # Uninitialized cache engine. Will be initialized by - # self.init_cache_engine(). 
- self.cache_config = None - self.cache_engine = None - self.cache_events = None - self.gpu_cache = None - - def init_model(self) -> None: - # torch.distributed.all_reduce does not free the input tensor until - # the synchronization point. This causes the memory usage to grow - # as the number of all_reduce calls increases. This env var disables - # this behavior. - # Related issue: - # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573 - os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" - - # This env var set by Ray causes exceptions with graph building. - os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) - self.device = torch.device(f"cuda:{self.local_rank}") - torch.cuda.set_device(self.device) - - _check_if_gpu_supports_dtype(self.model_config.dtype) - - # Initialize the distributed environment. - init_distributed_environment(self.parallel_config, self.rank, - self.distributed_init_method) - if not self.parallel_config.disable_custom_all_reduce: - init_custom_ar() - # Initialize the model. - set_random_seed(self.model_config.seed) - - def load_model(self): - self.model_runner.load_model() - - @torch.inference_mode() - def profile_num_available_blocks( - self, - block_size: int, - gpu_memory_utilization: float, - cpu_swap_space: int, - cache_dtype: str, - ) -> Tuple[int, int]: - """Profiles the peak memory usage of the model and returns the maximum - number of GPU and CPU cache blocks that can be allocated. - - Args: - block_size: The size of the cache block. - gpu_memory_utilization: The fraction of the total GPU memory to use. - cpu_swap_space: The size of the CPU swap space in bytes. - """ - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. - torch.cuda.empty_cache() - - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. - self.model_runner.profile_run() - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. - torch.cuda.synchronize() - free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() - peak_memory = total_gpu_memory - free_gpu_memory - - cache_block_size = CacheEngine.get_cache_block_size( - block_size, cache_dtype, self.model_config, self.parallel_config) - num_gpu_blocks = int( - (total_gpu_memory * gpu_memory_utilization - peak_memory) // - cache_block_size) - num_cpu_blocks = int(cpu_swap_space // cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - num_cpu_blocks = max(num_cpu_blocks, 0) - if self.model_runner.lora_manager: - self.model_runner.remove_all_loras() - gc.collect() - torch.cuda.empty_cache() - return num_gpu_blocks, num_cpu_blocks - - def init_cache_engine(self, cache_config: CacheConfig) -> None: - self.cache_config = cache_config - self.cache_engine = CacheEngine(self.cache_config, self.model_config, - self.parallel_config) - self.cache_events = self.cache_engine.events - self.gpu_cache = self.cache_engine.gpu_cache - self.model_runner.set_block_size(self.cache_engine.block_size) - - def warm_up_model(self) -> None: - if not self.model_config.enforce_eager: - self.model_runner.capture_model(self.gpu_cache) - # Reset the seed to ensure that the random state is not affected by - # the model initialization and profiling. 
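`profile_num_available_blocks` above turns the memory left over after the profiling pass into a whole number of KV-cache blocks using `CacheEngine.get_cache_block_size`. A worked example with assumed numbers (a 7B-class fp16 model on an 80 GiB GPU; none of these figures are measured):

```python
# All model/GPU numbers below are assumptions for illustration, not measured values.
GiB = 1 << 30

block_size, num_layers, num_kv_heads, head_size, dtype_size = 16, 32, 32, 128, 2   # fp16

# Same arithmetic as CacheEngine.get_cache_block_size: key + value, across all layers.
cache_block_size = dtype_size * num_layers * 2 * block_size * num_kv_heads * head_size
assert cache_block_size == 8 * (1 << 20)    # 8 MiB per block

total_gpu_memory = 80 * GiB
gpu_memory_utilization = 0.9
peak_memory = 18 * GiB                      # pretend profile_run peaked here

num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size)
assert num_gpu_blocks == 6912               # room for 6912 * 16 = 110,592 cached tokens
```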
- set_random_seed(self.model_config.seed) - - def cache_swap( - self, - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ) -> None: - # Issue cache operations. - issued_cache_op = False - if blocks_to_swap_in: - self.cache_engine.swap_in(blocks_to_swap_in) - issued_cache_op = True - if blocks_to_swap_out: - self.cache_engine.swap_out(blocks_to_swap_out) - issued_cache_op = True - if blocks_to_copy: - self.cache_engine.copy(blocks_to_copy) - issued_cache_op = True - - cache_events = self.cache_events if issued_cache_op else None - - # Wait for cache operations to finish. - # TODO(woosuk): Profile swapping overhead and optimize if needed. - if cache_events is not None: - for event in cache_events: - event.wait() - - @torch.inference_mode() - def execute_model( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None, - blocks_to_swap_in: Optional[Dict[int, int]] = None, - blocks_to_swap_out: Optional[Dict[int, int]] = None, - blocks_to_copy: Optional[Dict[int, List[int]]] = None, - ) -> Optional[SamplerOutput]: - if self.is_driver_worker: - assert seq_group_metadata_list is not None - num_seq_groups = len(seq_group_metadata_list) - assert blocks_to_swap_in is not None - assert blocks_to_swap_out is not None - assert blocks_to_copy is not None - data = { - "num_seq_groups": num_seq_groups, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - } - broadcast_tensor_dict(data, src=0) - else: - data = broadcast_tensor_dict(src=0) - num_seq_groups = data["num_seq_groups"] - blocks_to_swap_in = data["blocks_to_swap_in"] - blocks_to_swap_out = data["blocks_to_swap_out"] - blocks_to_copy = data["blocks_to_copy"] - - self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return {} - - output = self.model_runner.execute_model(seq_group_metadata_list, - self.gpu_cache) - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_runner.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.model_runner.remove_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.model_runner.list_loras() - - -def init_distributed_environment( - parallel_config: ParallelConfig, - rank: int, - distributed_init_method: Optional[str] = None, -) -> None: - """Initialize the distributed environment.""" - if torch.distributed.is_initialized(): - torch_world_size = torch.distributed.get_world_size() - if torch_world_size != parallel_config.world_size: - raise RuntimeError( - "torch.distributed is already initialized but the torch world " - "size does not match parallel_config.world_size " - f"({torch_world_size} vs. {parallel_config.world_size}).") - elif not distributed_init_method: - raise ValueError( - "distributed_init_method must be set if torch.distributed " - "is not already initialized") - else: - torch.distributed.init_process_group( - backend="nccl", - world_size=parallel_config.world_size, - rank=rank, - init_method=distributed_init_method, - ) - - # A small all_reduce for warmup. - torch.distributed.all_reduce(torch.zeros(1).cuda()) - ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) - - -def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): - # Check if the GPU supports the dtype. 
- if torch_dtype == torch.bfloat16: - compute_capability = torch.cuda.get_device_capability() - if compute_capability[0] < 8: - gpu_name = torch.cuda.get_device_name() - raise ValueError( - "Bfloat16 is only supported on GPUs with compute capability " - f"of at least 8.0. Your {gpu_name} GPU has compute capability " - f"{compute_capability[0]}.{compute_capability[1]}. " - "You can use float16 instead by explicitly setting the " - "`dtype` flag in CLI, for example: --dtype=half.")
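`_check_if_gpu_supports_dtype` reduces to a compute-capability gate: bfloat16 needs an SM 8.0+ (Ampere or newer) GPU. The same check in isolation, for reference (requires a CUDA build of PyTorch and a visible GPU):

```python
import torch

major, minor = torch.cuda.get_device_capability()
supports_bf16 = major >= 8          # e.g. (8, 0) for A100, (9, 0) for H100
print(torch.cuda.get_device_name(), f"SM {major}.{minor}",
      "bf16 OK" if supports_bf16 else "use --dtype=half")
```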