Upload inference_vLLM.ipynb

Browse files

Files changed (1) hide show

inference_vLLM.ipynb +1349 -0

inference_vLLM.ipynb ADDED Viewed

	@@ -0,0 +1,1349 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e0538a90-61d8-4bd0-b2f7-e08e69b32295",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting vllm\n",
+      "  Downloading vllm-0.6.4.post1-cp38-abi3-manylinux1_x86_64.whl.metadata (10 kB)\n",
+      "Requirement already satisfied: psutil in /usr/local/lib/python3.11/dist-packages (from vllm) (6.0.0)\n",
+      "Collecting sentencepiece (from vllm)\n",
+      "  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)\n",
+      "Requirement already satisfied: numpy<2.0.0 in /usr/local/lib/python3.11/dist-packages (from vllm) (1.26.3)\n",
+      "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.11/dist-packages (from vllm) (2.32.3)\n",
+      "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from vllm) (4.67.1)\n",
+      "Collecting py-cpuinfo (from vllm)\n",
+      "  Downloading py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)\n",
+      "Requirement already satisfied: transformers>=4.45.2 in /usr/local/lib/python3.11/dist-packages (from vllm) (4.47.0.dev0)\n",
+      "Requirement already satisfied: tokenizers>=0.19.1 in /usr/local/lib/python3.11/dist-packages (from vllm) (0.20.3)\n",
+      "Collecting protobuf (from vllm)\n",
+      "  Downloading protobuf-5.29.1-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)\n",
+      "Requirement already satisfied: aiohttp in /usr/local/lib/python3.11/dist-packages (from vllm) (3.11.8)\n",
+      "Collecting openai>=1.45.0 (from vllm)\n",
+      "  Downloading openai-1.57.0-py3-none-any.whl.metadata (24 kB)\n",
+      "Collecting uvicorn[standard] (from vllm)\n",
+      "  Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)\n",
+      "Collecting pydantic>=2.9 (from vllm)\n",
+      "  Downloading pydantic-2.10.3-py3-none-any.whl.metadata (172 kB)\n",
+      "Requirement already satisfied: pillow in /usr/local/lib/python3.11/dist-packages (from vllm) (10.2.0)\n",
+      "Requirement already satisfied: prometheus-client>=0.18.0 in /usr/local/lib/python3.11/dist-packages (from vllm) (0.21.0)\n",
+      "Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)\n",
+      "  Downloading prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl.metadata (13 kB)\n",
+      "Collecting tiktoken>=0.6.0 (from vllm)\n",
+      "  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
+      "Collecting lm-format-enforcer<0.11,>=0.10.9 (from vllm)\n",
+      "  Downloading lm_format_enforcer-0.10.9-py3-none-any.whl.metadata (17 kB)\n",
+      "Collecting outlines<0.1,>=0.0.43 (from vllm)\n",
+      "  Downloading outlines-0.0.46-py3-none-any.whl.metadata (15 kB)\n",
+      "Collecting typing-extensions>=4.10 (from vllm)\n",
+      "  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)\n",
+      "Requirement already satisfied: filelock>=3.10.4 in /usr/local/lib/python3.11/dist-packages (from vllm) (3.13.1)\n",
+      "Collecting partial-json-parser (from vllm)\n",
+      "  Downloading partial_json_parser-0.2.1.1.post4-py3-none-any.whl.metadata (6.2 kB)\n",
+      "Requirement already satisfied: pyzmq in /usr/local/lib/python3.11/dist-packages (from vllm) (24.0.1)\n",
+      "Collecting msgspec (from vllm)\n",
+      "  Downloading msgspec-0.18.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)\n",
+      "Collecting gguf==0.10.0 (from vllm)\n",
+      "  Downloading gguf-0.10.0-py3-none-any.whl.metadata (3.5 kB)\n",
+      "Requirement already satisfied: importlib-metadata in /usr/lib/python3/dist-packages (from vllm) (4.6.4)\n",
+      "Collecting mistral-common>=1.5.0 (from mistral-common[opencv]>=1.5.0->vllm)\n",
+      "  Downloading mistral_common-1.5.1-py3-none-any.whl.metadata (4.6 kB)\n",
+      "Requirement already satisfied: pyyaml in /usr/local/lib/python3.11/dist-packages (from vllm) (6.0.2)\n",
+      "Collecting einops (from vllm)\n",
+      "  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)\n",
+      "Collecting compressed-tensors==0.8.0 (from vllm)\n",
+      "  Downloading compressed_tensors-0.8.0-py3-none-any.whl.metadata (6.8 kB)\n",
+      "Collecting ray>=2.9 (from vllm)\n",
+      "  Downloading ray-2.40.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (17 kB)\n",
+      "Collecting nvidia-ml-py>=12.560.30 (from vllm)\n",
+      "  Downloading nvidia_ml_py-12.560.30-py3-none-any.whl.metadata (8.6 kB)\n",
+      "Collecting torch==2.5.1 (from vllm)\n",
+      "  Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)\n",
+      "Collecting torchvision==0.20.1 (from vllm)\n",
+      "  Downloading torchvision-0.20.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)\n",
+      "Collecting xformers==0.0.28.post3 (from vllm)\n",
+      "  Downloading xformers-0.0.28.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)\n",
+      "Collecting fastapi!=0.113.*,!=0.114.0,>=0.107.0 (from vllm)\n",
+      "  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)\n",
+      "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch==2.5.1->vllm) (3.2.1)\n",
+      "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch==2.5.1->vllm) (3.1.3)\n",
+      "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch==2.5.1->vllm) (2024.2.0)\n",
+      "Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->vllm)\n",
+      "  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
+      "Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1->vllm)\n",
+      "  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
+      "Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.1->vllm)\n",
+      "  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n",
+      "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch==2.5.1->vllm) (9.1.0.70)\n",
+      "Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.5.1->vllm)\n",
+      "  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
+      "Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.5.1->vllm)\n",
+      "  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
+      "Collecting nvidia-curand-cu12==10.3.5.147 (from torch==2.5.1->vllm)\n",
+      "  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
+      "Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch==2.5.1->vllm)\n",
+      "  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n",
+      "Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch==2.5.1->vllm)\n",
+      "  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n",
+      "Collecting nvidia-nccl-cu12==2.21.5 (from torch==2.5.1->vllm)\n",
+      "  Downloading nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)\n",
+      "Collecting nvidia-nvtx-cu12==12.4.127 (from torch==2.5.1->vllm)\n",
+      "  Downloading nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.7 kB)\n",
+      "Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch==2.5.1->vllm)\n",
+      "  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
+      "Collecting triton==3.1.0 (from torch==2.5.1->vllm)\n",
+      "  Downloading triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)\n",
+      "Collecting sympy==1.13.1 (from torch==2.5.1->vllm)\n",
+      "  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)\n",
+      "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch==2.5.1->vllm) (1.3.0)\n",
+      "Collecting starlette<0.42.0,>=0.40.0 (from fastapi!=0.113.*,!=0.114.0,>=0.107.0->vllm)\n",
+      "  Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)\n",
+      "Collecting interegular>=0.3.2 (from lm-format-enforcer<0.11,>=0.10.9->vllm)\n",
+      "  Downloading interegular-0.3.3-py37-none-any.whl.metadata (3.0 kB)\n",
+      "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from lm-format-enforcer<0.11,>=0.10.9->vllm) (24.1)\n",
+      "Requirement already satisfied: jsonschema<5.0.0,>=4.21.1 in /usr/local/lib/python3.11/dist-packages (from mistral-common>=1.5.0->mistral-common[opencv]>=1.5.0->vllm) (4.23.0)\n",
+      "Collecting pillow (from vllm)\n",
+      "  Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.2 kB)\n",
+      "Collecting tiktoken>=0.6.0 (from vllm)\n",
+      "  Downloading tiktoken-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
+      "Collecting opencv-python-headless<5.0.0,>=4.0.0 (from mistral-common[opencv]>=1.5.0->vllm)\n",
+      "  Downloading opencv_python_headless-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)\n",
+      "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from openai>=1.45.0->vllm) (4.6.0)\n",
+      "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai>=1.45.0->vllm) (1.7.0)\n",
+      "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from openai>=1.45.0->vllm) (0.27.2)\n",
+      "Collecting jiter<1,>=0.4.0 (from openai>=1.45.0->vllm)\n",
+      "  Downloading jiter-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)\n",
+      "Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from openai>=1.45.0->vllm) (1.3.1)\n",
+      "Collecting lark (from outlines<0.1,>=0.0.43->vllm)\n",
+      "  Downloading lark-1.2.2-py3-none-any.whl.metadata (1.8 kB)\n",
+      "Requirement already satisfied: nest-asyncio in /usr/local/lib/python3.11/dist-packages (from outlines<0.1,>=0.0.43->vllm) (1.6.0)\n",
+      "Collecting cloudpickle (from outlines<0.1,>=0.0.43->vllm)\n",
+      "  Downloading cloudpickle-3.1.0-py3-none-any.whl.metadata (7.0 kB)\n",
+      "Collecting diskcache (from outlines<0.1,>=0.0.43->vllm)\n",
+      "  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)\n",
+      "Collecting numba (from outlines<0.1,>=0.0.43->vllm)\n",
+      "  Downloading numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)\n",
+      "Requirement already satisfied: referencing in /usr/local/lib/python3.11/dist-packages (from outlines<0.1,>=0.0.43->vllm) (0.35.1)\n",
+      "Requirement already satisfied: datasets in /usr/local/lib/python3.11/dist-packages (from outlines<0.1,>=0.0.43->vllm) (3.1.0)\n",
+      "Collecting pycountry (from outlines<0.1,>=0.0.43->vllm)\n",
+      "  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)\n",
+      "Collecting pyairports (from outlines<0.1,>=0.0.43->vllm)\n",
+      "  Downloading pyairports-2.1.1-py3-none-any.whl.metadata (1.7 kB)\n",
+      "Collecting annotated-types>=0.6.0 (from pydantic>=2.9->vllm)\n",
+      "  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)\n",
+      "Collecting pydantic-core==2.27.1 (from pydantic>=2.9->vllm)\n",
+      "  Downloading pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
+      "Collecting click>=7.0 (from ray>=2.9->vllm)\n",
+      "  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)\n",
+      "Collecting msgpack<2.0.0,>=1.0.0 (from ray>=2.9->vllm)\n",
+      "  Downloading msgpack-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)\n",
+      "Requirement already satisfied: aiosignal in /usr/local/lib/python3.11/dist-packages (from ray>=2.9->vllm) (1.3.1)\n",
+      "Requirement already satisfied: frozenlist in /usr/local/lib/python3.11/dist-packages (from ray>=2.9->vllm) (1.5.0)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests>=2.26.0->vllm) (3.3.2)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests>=2.26.0->vllm) (3.10)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests>=2.26.0->vllm) (2.2.3)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests>=2.26.0->vllm) (2024.8.30)\n",
+      "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.11/dist-packages (from tiktoken>=0.6.0->vllm) (2024.11.6)\n",
+      "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.11/dist-packages (from tokenizers>=0.19.1->vllm) (0.26.3)\n",
+      "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers>=4.45.2->vllm) (0.4.5)\n",
+      "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (2.4.3)\n",
+      "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (24.2.0)\n",
+      "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (6.1.0)\n",
+      "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (0.2.0)\n",
+      "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (1.18.0)\n",
+      "Requirement already satisfied: h11>=0.8 in /usr/local/lib/python3.11/dist-packages (from uvicorn[standard]->vllm) (0.14.0)\n",
+      "Collecting httptools>=0.6.3 (from uvicorn[standard]->vllm)\n",
+      "  Downloading httptools-0.6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)\n",
+      "Collecting python-dotenv>=0.13 (from uvicorn[standard]->vllm)\n",
+      "  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n",
+      "Collecting uvloop!=0.15.0,!=0.15.1,>=0.14.0 (from uvicorn[standard]->vllm)\n",
+      "  Downloading uvloop-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n",
+      "Collecting watchfiles>=0.13 (from uvicorn[standard]->vllm)\n",
+      "  Downloading watchfiles-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n",
+      "Collecting websockets>=10.4 (from uvicorn[standard]->vllm)\n",
+      "  Downloading websockets-14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
+      "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx<1,>=0.23.0->openai>=1.45.0->vllm) (1.0.5)\n",
+      "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.11/dist-packages (from jsonschema<5.0.0,>=4.21.1->mistral-common>=1.5.0->mistral-common[opencv]>=1.5.0->vllm) (2023.12.1)\n",
+      "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.11/dist-packages (from jsonschema<5.0.0,>=4.21.1->mistral-common>=1.5.0->mistral-common[opencv]>=1.5.0->vllm) (0.20.0)\n",
+      "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.11/dist-packages (from datasets->outlines<0.1,>=0.0.43->vllm) (18.1.0)\n",
+      "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.11/dist-packages (from datasets->outlines<0.1,>=0.0.43->vllm) (0.3.8)\n",
+      "Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from datasets->outlines<0.1,>=0.0.43->vllm) (2.2.3)\n",
+      "Requirement already satisfied: xxhash in /usr/local/lib/python3.11/dist-packages (from datasets->outlines<0.1,>=0.0.43->vllm) (3.5.0)\n",
+      "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.11/dist-packages (from datasets->outlines<0.1,>=0.0.43->vllm) (0.70.16)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch==2.5.1->vllm) (2.1.5)\n",
+      "Collecting llvmlite<0.44,>=0.43.0dev0 (from numba->outlines<0.1,>=0.0.43->vllm)\n",
+      "  Downloading llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)\n",
+      "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets->outlines<0.1,>=0.0.43->vllm) (2.9.0.post0)\n",
+      "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets->outlines<0.1,>=0.0.43->vllm) (2024.2)\n",
+      "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets->outlines<0.1,>=0.0.43->vllm) (2024.2)\n",
+      "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas->datasets->outlines<0.1,>=0.0.43->vllm) (1.16.0)\n",
+      "Downloading vllm-0.6.4.post1-cp38-abi3-manylinux1_x86_64.whl (198.9 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m198.9/198.9 MB\u001b[0m \u001b[31m117.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading compressed_tensors-0.8.0-py3-none-any.whl (86 kB)\n",
+      "Downloading gguf-0.10.0-py3-none-any.whl (71 kB)\n",
+      "Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl (906.5 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m906.5/906.5 MB\u001b[0m \u001b[31m85.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading torchvision-0.20.1-cp311-cp311-manylinux1_x86_64.whl (7.2 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m111.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading xformers-0.0.28.post3-cp311-cp311-manylinux_2_28_x86_64.whl (16.7 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.7/16.7 MB\u001b[0m \u001b[31m105.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m178.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m61.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m85.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m116.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl (211.5 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m152.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl (56.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m127.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m145.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl (207.5 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m134.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl (188.7 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m188.7/188.7 MB\u001b[0m \u001b[31m142.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m151.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (99 kB)\n",
+      "Downloading sympy-1.13.1-py3-none-any.whl (6.2 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m161.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.5/209.5 MB\u001b[0m \u001b[31m88.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading fastapi-0.115.6-py3-none-any.whl (94 kB)\n",
+      "Downloading lm_format_enforcer-0.10.9-py3-none-any.whl (43 kB)\n",
+      "Downloading mistral_common-1.5.1-py3-none-any.whl (6.5 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.5/6.5 MB\u001b[0m \u001b[31m167.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m146.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading nvidia_ml_py-12.560.30-py3-none-any.whl (40 kB)\n",
+      "Downloading openai-1.57.0-py3-none-any.whl (389 kB)\n",
+      "Downloading outlines-0.0.46-py3-none-any.whl (101 kB)\n",
+      "Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl (4.5 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m148.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl (19 kB)\n",
+      "Downloading pydantic-2.10.3-py3-none-any.whl (456 kB)\n",
+      "Downloading pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m180.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading ray-2.40.0-cp311-cp311-manylinux2014_x86_64.whl (67.0 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.0/67.0 MB\u001b[0m \u001b[31m151.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading protobuf-5.29.1-cp38-abi3-manylinux2014_x86_64.whl (319 kB)\n",
+      "Downloading tiktoken-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m153.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading typing_extensions-4.12.2-py3-none-any.whl (37 kB)\n",
+      "Downloading einops-0.8.0-py3-none-any.whl (43 kB)\n",
+      "Downloading msgspec-0.18.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209 kB)\n",
+      "Downloading partial_json_parser-0.2.1.1.post4-py3-none-any.whl (9.9 kB)\n",
+      "Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)\n",
+      "Downloading annotated_types-0.7.0-py3-none-any.whl (13 kB)\n",
+      "Downloading click-8.1.7-py3-none-any.whl (97 kB)\n",
+      "Downloading httptools-0.6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (459 kB)\n",
+      "Downloading interegular-0.3.3-py37-none-any.whl (23 kB)\n",
+      "Downloading jiter-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (343 kB)\n",
+      "Downloading msgpack-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (403 kB)\n",
+      "Downloading opencv_python_headless-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.9 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.9/49.9 MB\u001b[0m \u001b[31m117.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n",
+      "Downloading starlette-0.41.3-py3-none-any.whl (73 kB)\n",
+      "Downloading uvloop-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.0 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.0/4.0 MB\u001b[0m \u001b[31m160.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading watchfiles-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (442 kB)\n",
+      "Downloading websockets-14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (168 kB)\n",
+      "Downloading cloudpickle-3.1.0-py3-none-any.whl (22 kB)\n",
+      "Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)\n",
+      "Downloading lark-1.2.2-py3-none-any.whl (111 kB)\n",
+      "Downloading numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.7 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.7/3.7 MB\u001b[0m \u001b[31m132.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading pyairports-2.1.1-py3-none-any.whl (371 kB)\n",
+      "Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.3/6.3 MB\u001b[0m \u001b[31m130.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading uvicorn-0.32.1-py3-none-any.whl (63 kB)\n",
+      "Downloading llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43.9 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.9/43.9 MB\u001b[0m \u001b[31m168.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hInstalling collected packages: sentencepiece, pyairports, py-cpuinfo, nvidia-ml-py, websockets, uvloop, typing-extensions, triton, sympy, python-dotenv, pycountry, protobuf, pillow, partial-json-parser, opencv-python-headless, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, msgspec, msgpack, llvmlite, lark, jiter, interegular, httptools, gguf, einops, diskcache, cloudpickle, click, annotated-types, watchfiles, uvicorn, tiktoken, starlette, pydantic-core, nvidia-cusparse-cu12, numba, pydantic, prometheus-fastapi-instrumentator, nvidia-cusolver-cu12, torch, ray, openai, mistral-common, lm-format-enforcer, fastapi, xformers, torchvision, outlines, compressed-tensors, vllm\n",
+      "  Attempting uninstall: typing-extensions\n",
+      "    Found existing installation: typing_extensions 4.9.0\n",
+      "    Uninstalling typing_extensions-4.9.0:\n",
+      "      Successfully uninstalled typing_extensions-4.9.0\n",
+      "  Attempting uninstall: triton\n",
+      "    Found existing installation: triton 3.0.0\n",
+      "    Uninstalling triton-3.0.0:\n",
+      "      Successfully uninstalled triton-3.0.0\n",
+      "  Attempting uninstall: sympy\n",
+      "    Found existing installation: sympy 1.12\n",
+      "    Uninstalling sympy-1.12:\n",
+      "      Successfully uninstalled sympy-1.12\n",
+      "  Attempting uninstall: pillow\n",
+      "    Found existing installation: pillow 10.2.0\n",
+      "    Uninstalling pillow-10.2.0:\n",
+      "      Successfully uninstalled pillow-10.2.0\n",
+      "  Attempting uninstall: nvidia-nvtx-cu12\n",
+      "    Found existing installation: nvidia-nvtx-cu12 12.4.99\n",
+      "    Uninstalling nvidia-nvtx-cu12-12.4.99:\n",
+      "      Successfully uninstalled nvidia-nvtx-cu12-12.4.99\n",
+      "  Attempting uninstall: nvidia-nvjitlink-cu12\n",
+      "    Found existing installation: nvidia-nvjitlink-cu12 12.4.99\n",
+      "    Uninstalling nvidia-nvjitlink-cu12-12.4.99:\n",
+      "      Successfully uninstalled nvidia-nvjitlink-cu12-12.4.99\n",
+      "  Attempting uninstall: nvidia-nccl-cu12\n",
+      "    Found existing installation: nvidia-nccl-cu12 2.20.5\n",
+      "    Uninstalling nvidia-nccl-cu12-2.20.5:\n",
+      "      Successfully uninstalled nvidia-nccl-cu12-2.20.5\n",
+      "  Attempting uninstall: nvidia-curand-cu12\n",
+      "    Found existing installation: nvidia-curand-cu12 10.3.5.119\n",
+      "    Uninstalling nvidia-curand-cu12-10.3.5.119:\n",
+      "      Successfully uninstalled nvidia-curand-cu12-10.3.5.119\n",
+      "  Attempting uninstall: nvidia-cufft-cu12\n",
+      "    Found existing installation: nvidia-cufft-cu12 11.2.0.44\n",
+      "    Uninstalling nvidia-cufft-cu12-11.2.0.44:\n",
+      "      Successfully uninstalled nvidia-cufft-cu12-11.2.0.44\n",
+      "  Attempting uninstall: nvidia-cuda-runtime-cu12\n",
+      "    Found existing installation: nvidia-cuda-runtime-cu12 12.4.99\n",
+      "    Uninstalling nvidia-cuda-runtime-cu12-12.4.99:\n",
+      "      Successfully uninstalled nvidia-cuda-runtime-cu12-12.4.99\n",
+      "  Attempting uninstall: nvidia-cuda-nvrtc-cu12\n",
+      "    Found existing installation: nvidia-cuda-nvrtc-cu12 12.4.99\n",
+      "    Uninstalling nvidia-cuda-nvrtc-cu12-12.4.99:\n",
+      "      Successfully uninstalled nvidia-cuda-nvrtc-cu12-12.4.99\n",
+      "  Attempting uninstall: nvidia-cuda-cupti-cu12\n",
+      "    Found existing installation: nvidia-cuda-cupti-cu12 12.4.99\n",
+      "    Uninstalling nvidia-cuda-cupti-cu12-12.4.99:\n",
+      "      Successfully uninstalled nvidia-cuda-cupti-cu12-12.4.99\n",
+      "  Attempting uninstall: nvidia-cublas-cu12\n",
+      "    Found existing installation: nvidia-cublas-cu12 12.4.2.65\n",
+      "    Uninstalling nvidia-cublas-cu12-12.4.2.65:\n",
+      "      Successfully uninstalled nvidia-cublas-cu12-12.4.2.65\n",
+      "  Attempting uninstall: nvidia-cusparse-cu12\n",
+      "    Found existing installation: nvidia-cusparse-cu12 12.3.0.142\n",
+      "    Uninstalling nvidia-cusparse-cu12-12.3.0.142:\n",
+      "      Successfully uninstalled nvidia-cusparse-cu12-12.3.0.142\n",
+      "  Attempting uninstall: nvidia-cusolver-cu12\n",
+      "    Found existing installation: nvidia-cusolver-cu12 11.6.0.99\n",
+      "    Uninstalling nvidia-cusolver-cu12-11.6.0.99:\n",
+      "      Successfully uninstalled nvidia-cusolver-cu12-11.6.0.99\n",
+      "  Attempting uninstall: torch\n",
+      "    Found existing installation: torch 2.4.1+cu124\n",
+      "    Uninstalling torch-2.4.1+cu124:\n",
+      "      Successfully uninstalled torch-2.4.1+cu124\n",
+      "  Attempting uninstall: torchvision\n",
+      "    Found existing installation: torchvision 0.19.1+cu124\n",
+      "    Uninstalling torchvision-0.19.1+cu124:\n",
+      "      Successfully uninstalled torchvision-0.19.1+cu124\n",
+      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+      "torchaudio 2.4.1+cu124 requires torch==2.4.1, but you have torch 2.5.1 which is incompatible.\u001b[0m\u001b[31m\n",
+      "\u001b[0mSuccessfully installed annotated-types-0.7.0 click-8.1.7 cloudpickle-3.1.0 compressed-tensors-0.8.0 diskcache-5.6.3 einops-0.8.0 fastapi-0.115.6 gguf-0.10.0 httptools-0.6.4 interegular-0.3.3 jiter-0.8.0 lark-1.2.2 llvmlite-0.43.0 lm-format-enforcer-0.10.9 mistral-common-1.5.1 msgpack-1.1.0 msgspec-0.18.6 numba-0.60.0 nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-ml-py-12.560.30 nvidia-nccl-cu12-2.21.5 nvidia-nvjitlink-cu12-12.4.127 nvidia-nvtx-cu12-12.4.127 openai-1.57.0 opencv-python-headless-4.10.0.84 outlines-0.0.46 partial-json-parser-0.2.1.1.post4 pillow-10.4.0 prometheus-fastapi-instrumentator-7.0.0 protobuf-5.29.1 py-cpuinfo-9.0.0 pyairports-2.1.1 pycountry-24.6.1 pydantic-2.10.3 pydantic-core-2.27.1 python-dotenv-1.0.1 ray-2.40.0 sentencepiece-0.2.0 starlette-0.41.3 sympy-1.13.1 tiktoken-0.7.0 torch-2.5.1 torchvision-0.20.1 triton-3.1.0 typing-extensions-4.12.2 uvicorn-0.32.1 uvloop-0.21.0 vllm-0.6.4.post1 watchfiles-1.0.0 websockets-14.1 xformers-0.0.28.post3\n",
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n",
+      "\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Pin vLLM to the release this notebook was executed with so a re-run resolves the same stack.\n",
+    "# The install replaces torch, so restart the kernel before running the cells below.\n",
+    "%pip install vllm==0.6.4.post1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e772542e-467c-481a-9128-8364987a1bd9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sun Dec  8 01:39:25 2024       \n",
+      "+-----------------------------------------------------------------------------------------+\n",
+      "| NVIDIA-SMI 565.57.01              Driver Version: 565.57.01      CUDA Version: 12.7     |\n",
+      "|-----------------------------------------+------------------------+----------------------+\n",
+      "| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |\n",
+      "| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\n",
+      "|                                         |                        |               MIG M. |\n",
+      "|=========================================+========================+======================|\n",
+      "|   0  NVIDIA H100 NVL                On  |   00000000:3C:00.0 Off |                    0 |\n",
+      "| N/A   26C    P0             62W /  310W |       1MiB /  95830MiB |      0%      Default |\n",
+      "|                                         |                        |             Disabled |\n",
+      "+-----------------------------------------+------------------------+----------------------+\n",
+      "|   1  NVIDIA H100 NVL                On  |   00000000:AE:00.0 Off |                    0 |\n",
+      "| N/A   26C    P0             59W /  310W |       1MiB /  95830MiB |      0%      Default |\n",
+      "|                                         |                        |             Disabled |\n",
+      "+-----------------------------------------+------------------------+----------------------+\n",
+      "|   2  NVIDIA H100 NVL                On  |   00000000:BD:00.0 Off |                    0 |\n",
+      "| N/A   24C    P0             60W /  310W |       1MiB /  95830MiB |      0%      Default |\n",
+      "|                                         |                        |             Disabled |\n",
+      "+-----------------------------------------+------------------------+----------------------+\n",
+      "|   3  NVIDIA H100 NVL                On  |   00000000:BE:00.0 Off |                    0 |\n",
+      "| N/A   26C    P0             60W /  310W |       1MiB /  95830MiB |      0%      Default |\n",
+      "|                                         |                        |             Disabled |\n",
+      "+-----------------------------------------+------------------------+----------------------+\n",
+      "                                                                                         \n",
+      "+-----------------------------------------------------------------------------------------+\n",
+      "| Processes:                                                                              |\n",
+      "|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |\n",
+      "|        ID   ID                                                               Usage      |\n",
+      "|=========================================================================================|\n",
+      "|  No running processes found                                                             |\n",
+      "+-----------------------------------------------------------------------------------------+\n"
+     ]
+    }
+   ],
+   "source": [
+    "!nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2bf7e331-4686-4c0d-ae0f-72cbb79e2e8c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from vllm import LLM, SamplingParams"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "a51d52bc-d60e-412e-a150-20bc0526d20e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7c598964dfdb4818aad022b3d085af8d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO 12-08 01:39:53 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.\n",
+      "INFO 12-08 01:39:53 awq_marlin.py:113] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference\n",
+      "WARNING 12-08 01:39:53 config.py:428] awq quantization is not fully optimized yet. The speed can be slower than non-quantized models.\n",
+      "INFO 12-08 01:39:53 config.py:1020] Defaulting to use mp for distributed inference\n",
+      "WARNING 12-08 01:39:53 arg_utils.py:1013] Chunked prefill is enabled by default for models with max_model_len > 32K. Currently, chunked prefill might not work with some features or models. If you encounter any issues, please disable chunked prefill by setting --enable-chunked-prefill=False.\n",
+      "INFO 12-08 01:39:53 config.py:1136] Chunked prefill is enabled with max_num_batched_tokens=512.\n",
+      "INFO 12-08 01:39:53 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='kishizaki-sci/Llama-3.1-405B-Instruct-AWQ-4bit-JP-EN', speculative_config=None, tokenizer='kishizaki-sci/Llama-3.1-405B-Instruct-AWQ-4bit-JP-EN', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=kishizaki-sci/Llama-3.1-405B-Instruct-AWQ-4bit-JP-EN, num_scheduler_steps=1, chunked_prefill_enabled=True multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1f7cc9b8b1b54e97a7638c1fdf2ddcaf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ef6601b681a24fd9a2208a04352df88a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "06e8fb444a9847878c56f24e9856c9e2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7bef29f785a341a5bada54897d06284e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "generation_config.json:   0%|          | 0.00/182 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING 12-08 01:39:57 multiproc_gpu_executor.py:56] Reducing Torch parallelism from 72 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.\n",
+      "INFO 12-08 01:39:57 custom_cache_manager.py:17] Setting Triton cache manager to: vllm.triton_utils.custom_cache_manager:CustomCacheManager\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=729)\u001b[0;0m INFO 12-08 01:39:57 multiproc_worker_utils.py:215] Worker ready; awaiting tasks\n",
+      "INFO 12-08 01:39:57 selector.py:135] Using Flash Attention backend.\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=729)\u001b[0;0m \u001b[1;36m(VllmWorkerProcess pid=730)\u001b[0;0m INFO 12-08 01:39:57 selector.py:135] Using Flash Attention backend.\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=730)\u001b[0;0m INFO 12-08 01:39:57 multiproc_worker_utils.py:215] Worker ready; awaiting tasks\n",
+      "INFO 12-08 01:39:57 selector.py:135] Using Flash Attention backend.\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=731)\u001b[0;0m INFO 12-08 01:39:57 selector.py:135] Using Flash Attention backend.\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=731)\u001b[0;0m INFO 12-08 01:39:57 multiproc_worker_utils.py:215] Worker ready; awaiting tasks\n",
+      "INFO 12-08 01:40:00 utils.py:961] Found nccl from library libnccl.so.2\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=730)\u001b[0;0m INFO 12-08 01:40:00 pynccl.py:69] vLLM is using nccl==2.21.5\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=729)\u001b[0;0m \u001b[1;36m(VllmWorkerProcess pid=731)\u001b[0;0m INFO 12-08 01:40:00 utils.py:961] Found nccl from library libnccl.so.2\n",
+      "INFO 12-08 01:40:00 utils.py:961] Found nccl from library libnccl.so.2\n",
+      "INFO 12-08 01:40:00 utils.py:961] Found nccl from library libnccl.so.2\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=729)\u001b[0;0m \u001b[1;36m(VllmWorkerProcess pid=730)\u001b[0;0m INFO 12-08 01:40:00 pynccl.py:69] vLLM is using nccl==2.21.5\n",
+      "INFO 12-08 01:40:00 pynccl.py:69] vLLM is using nccl==2.21.5\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=731)\u001b[0;0m INFO 12-08 01:40:00 pynccl.py:69] vLLM is using nccl==2.21.5\n",
+      "WARNING 12-08 01:40:01 custom_all_reduce.py:134] Custom allreduce is disabled because it's not supported on more than two PCIe-only GPUs. To silence this warning, specify disable_custom_all_reduce=True explicitly.\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=729)\u001b[0;0m \u001b[1;36m(VllmWorkerProcess pid=730)\u001b[0;0m WARNING 12-08 01:40:01 custom_all_reduce.py:134] Custom allreduce is disabled because it's not supported on more than two PCIe-only GPUs. To silence this warning, specify disable_custom_all_reduce=True explicitly.\n",
+      "WARNING 12-08 01:40:01 custom_all_reduce.py:134] Custom allreduce is disabled because it's not supported on more than two PCIe-only GPUs. To silence this warning, specify disable_custom_all_reduce=True explicitly.\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=731)\u001b[0;0m WARNING 12-08 01:40:01 custom_all_reduce.py:134] Custom allreduce is disabled because it's not supported on more than two PCIe-only GPUs. To silence this warning, specify disable_custom_all_reduce=True explicitly.\n",
+      "INFO 12-08 01:40:01 shm_broadcast.py:236] vLLM message queue communication handle: Handle(connect_ip='127.0.0.1', local_reader_ranks=[1, 2, 3], buffer=<vllm.distributed.device_communicators.shm_broadcast.ShmRingBuffer object at 0x7fe462e95610>, local_subscribe_port=36659, remote_subscribe_port=None)\n",
+      "INFO 12-08 01:40:01 model_runner.py:1072] Starting to load model kishizaki-sci/Llama-3.1-405B-Instruct-AWQ-4bit-JP-EN...\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=729)\u001b[0;0m \u001b[1;36m(VllmWorkerProcess pid=730)\u001b[0;0m \u001b[1;36m(VllmWorkerProcess pid=731)\u001b[0;0m INFO 12-08 01:40:01 model_runner.py:1072] Starting to load model kishizaki-sci/Llama-3.1-405B-Instruct-AWQ-4bit-JP-EN...\n",
+      "INFO 12-08 01:40:01 model_runner.py:1072] Starting to load model kishizaki-sci/Llama-3.1-405B-Instruct-AWQ-4bit-JP-EN...\n",
+      "INFO 12-08 01:40:01 model_runner.py:1072] Starting to load model kishizaki-sci/Llama-3.1-405B-Instruct-AWQ-4bit-JP-EN...\n",
+      "INFO 12-08 01:40:02 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=729)\u001b[0;0m INFO 12-08 01:40:02 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=730)\u001b[0;0m INFO 12-08 01:40:02 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=731)\u001b[0;0m INFO 12-08 01:40:02 weight_utils.py:243] Using model weights format ['*.safetensors']\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3992e4d9fca34515910b7bf2492a3bc2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00004-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3e0eec98a2b1418283e0809b0cf9b7ad",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00002-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "aef1bf777f134def9e8fdbf6038e9b0f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00006-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b506d6aed0024cde8f18df0a3e796fbc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00007-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "db9659785e1b49bdbc77298f8d95a746",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00008-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4ac31302b8d94e48996d65295561ea38",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00003-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "34494082039e41de94d38168f54e07b4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00005-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "71affa983f584b91b98bddedbc7f894e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00044.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2e9c94b7b0bc43d8b22785c62d09e264",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00009-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2fddbf414af2494692fc262abf2832b7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00010-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6e72305ba5cf40269479f81dbb25b69e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00011-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "023fe209088a42a28e7656c3da74343d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00012-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e20f953f68274473aaba3a3fe9d86591",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00013-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "70928ce19e7b4aa38313d2369fe0280f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00014-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e9f0748b3b954b748cea03d9d7973fd5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00015-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "56a827b4296d4d79a34ff5b095ca7532",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00016-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6bb5e0bd0d09458cb21c84f8c8cb2174",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00017-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d423b03ac7b84064b9fc2c0c86794e31",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00018-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "79956a438155481080f30b669d7aae7b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00019-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cc538ae927d2423d8bdf28c437522018",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00020-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "01d605b47f224b81a1d78c46bc512440",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00021-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e3a40b4984db442b863fb24b9510364b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00022-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ccdb98bdf695414b832cd7e7c28c042f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00023-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3480c970522041caa1de260c97f3a5aa",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00024-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "135cdeaff5be480ca496d0562acf83cd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00025-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d0b44e898f064aa7b43ea8b4d36e97ad",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00026-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1aa50a58715943caa10db97b0ee5981f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00027-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e9de0a30c1534f499465e270e157e9de",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00028-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "445f10e616fd49e1a4868c88fb31c56f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00029-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "292d0fbfbf8c4145b3a208480eeb6121",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00030-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "43893486f5dc40f290811b94d4c1352d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00031-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7291f54de4874b9ebbd2c215a1b5c5ed",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00032-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "461b39ab50e746a1bd34e8f641d75f15",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00033-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e98726876f774a92a5a4d5b3bc2b381a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00034-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d669bf2bd9d04dc881d16f5eb955f599",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00035-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1443fa509c8b4c2a8acaeb0c7bbc3f84",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00036-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9f388beeb6f245568782cf29d8a7b831",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00037-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c6a2ce5c1e564152802f56ec6e4e8a75",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00038-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2335b6093c9f49eeba58349209fbc38d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00039-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a7b3687060db425ab597c8e103deed46",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00040-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2b561cea2d324c40975c1eb9ea30cc5d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00041-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b287241358cf48a497101402d0bd693f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00042-of-00044.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cc93269313fc4f7f9d7d1a301373a2f8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00043-of-00044.safetensors:   0%|          | 0.00/4.22G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7fd37422efe449fbb6ac31fd869cee15",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00044-of-00044.safetensors:   0%|          | 0.00/4.20G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a664a2e9587343f99883157151ba080f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.safetensors.index.json:   0%|          | 0.00/239k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8e2b847d11014313807e518426b573a4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading safetensors checkpoint shards:   0% Completed | 0/44 [00:00<?, ?it/s]\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO 12-08 01:52:50 model_runner.py:1077] Loading model weights took 50.6331 GB\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=730)\u001b[0;0m INFO 12-08 01:52:52 model_runner.py:1077] Loading model weights took 50.6331 GB\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=729)\u001b[0;0m INFO 12-08 01:52:52 model_runner.py:1077] Loading model weights took 50.6331 GB\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=731)\u001b[0;0m INFO 12-08 01:52:52 model_runner.py:1077] Loading model weights took 50.6331 GB\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=730)\u001b[0;0m \u001b[1;36m(VllmWorkerProcess pid=729)\u001b[0;0m \u001b[1;36m(VllmWorkerProcess pid=731)\u001b[0;0m INFO 12-08 01:52:54 worker.py:232] Memory profiling results: total_gpu_memory=93.11GiB initial_memory_usage=51.58GiB peak_torch_memory=51.55GiB memory_usage_post_profile=51.82GiB non_torch_memory=1.15GiB kv_cache_size=37.61GiB gpu_memory_utilization=0.97\n",
+      "INFO 12-08 01:52:54 worker.py:232] Memory profiling results: total_gpu_memory=93.11GiB initial_memory_usage=51.51GiB peak_torch_memory=51.55GiB memory_usage_post_profile=51.68GiB non_torch_memory=1.01GiB kv_cache_size=37.75GiB gpu_memory_utilization=0.97\n",
+      "INFO 12-08 01:52:54 worker.py:232] Memory profiling results: total_gpu_memory=93.11GiB initial_memory_usage=51.58GiB peak_torch_memory=51.55GiB memory_usage_post_profile=51.82GiB non_torch_memory=1.15GiB kv_cache_size=37.61GiB gpu_memory_utilization=0.97\n",
+      "INFO 12-08 01:52:54 worker.py:232] Memory profiling results: total_gpu_memory=93.11GiB initial_memory_usage=51.51GiB peak_torch_memory=51.84GiB memory_usage_post_profile=51.68GiB non_torch_memory=1.02GiB kv_cache_size=37.46GiB gpu_memory_utilization=0.97\n",
+      "INFO 12-08 01:52:54 distributed_gpu_executor.py:57] # GPU blocks: 19483, # CPU blocks: 2080\n",
+      "INFO 12-08 01:52:54 distributed_gpu_executor.py:61] Maximum concurrency for 131072 tokens per request: 2.38x\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=730)\u001b[0;0m INFO 12-08 01:52:59 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=730)\u001b[0;0m INFO 12-08 01:52:59 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
+      "INFO 12-08 01:52:59 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
+      "INFO 12-08 01:52:59 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=731)\u001b[0;0m INFO 12-08 01:53:00 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=731)\u001b[0;0m INFO 12-08 01:53:00 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=729)\u001b[0;0m INFO 12-08 01:53:00 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=729)\u001b[0;0m INFO 12-08 01:53:00 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=731)\u001b[0;0m INFO 12-08 01:53:44 model_runner.py:1518] Graph capturing finished in 45 secs, took 2.71 GiB\n",
+      "INFO 12-08 01:53:45 model_runner.py:1518] Graph capturing finished in 46 secs, took 2.71 GiB\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=729)\u001b[0;0m INFO 12-08 01:53:45 model_runner.py:1518] Graph capturing finished in 45 secs, took 2.71 GiB\n",
+      "\u001b[1;36m(VllmWorkerProcess pid=730)\u001b[0;0m INFO 12-08 01:53:45 model_runner.py:1518] Graph capturing finished in 46 secs, took 2.71 GiB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Engine settings for the AWQ 4-bit quantized Llama-3.1-405B checkpoint.\n",
+    "engine_args = dict(\n",
+    "    model=\"kishizaki-sci/Llama-3.1-405B-Instruct-AWQ-4bit-JP-EN\",\n",
+    "    tensor_parallel_size=4,        # split the model across 4 worker processes\n",
+    "    gpu_memory_utilization=0.97,   # fraction of GPU memory vLLM may claim\n",
+    "    quantization=\"awq\",\n",
+    ")\n",
+    "\n",
+    "# Build the engine, then reuse its tokenizer for chat-template formatting.\n",
+    "llm = LLM(**engine_args)\n",
+    "tokenizer = llm.get_tokenizer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "cc81f387-a06f-4564-a50e-37e367a79422",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prompt pieces: a fixed system instruction plus the user's request.\n",
+    "DEFAULT_SYSTEM_PROMPT = \"あなたは日本人のアシスタントです。\"\n",
+    "text = \"plotly.graph_objectsを使って散布図を作るサンプルコードを書いてください．\"\n",
+    "\n",
+    "# Sampling below is stochastic (temperature/top_p), so pin a seed to make\n",
+    "# re-runs of this notebook sample the same way.\n",
+    "# NOTE(review): exact reproducibility may still depend on hardware/engine config.\n",
+    "SEED = 0\n",
+    "\n",
+    "messages = [\n",
+    "    {\"role\": \"system\", \"content\": DEFAULT_SYSTEM_PROMPT},\n",
+    "    {\"role\": \"user\", \"content\": text},\n",
+    "]\n",
+    "\n",
+    "# Render the chat messages to a plain prompt string (no tokenization here)\n",
+    "# and append the generation prompt so the model answers as the assistant.\n",
+    "prompt = tokenizer.apply_chat_template(\n",
+    "    messages,\n",
+    "    tokenize=False,\n",
+    "    add_generation_prompt=True\n",
+    ")\n",
+    "\n",
+    "sampling_params = SamplingParams(\n",
+    "    temperature=0.6,\n",
+    "    top_p=0.9,\n",
+    "    max_tokens=1000,   # upper bound on generated tokens\n",
+    "    seed=SEED,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "c74b2d83-12ff-4324-bc84-51e88b3e12b3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processed prompts: 100%|██████████| 1/1 [00:20<00:00, 20.38s/it, est. speed input: 3.29 toks/s, output: 13.59 toks/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "plotly.graph_objectsを使って散布図を作るサンプルコードを以下に示します。\n",
+      "\n",
+      "```python\n",
+      "import plotly.graph_objects as go\n",
+      "import numpy as np\n",
+      "\n",
+      "# サンプルデータを生成\n",
+      "np.random.seed(0)\n",
+      "x = np.random.randn(100)\n",
+      "y = np.random.randn(100)\n",
+      "\n",
+      "# 散布図を作成\n",
+      "fig = go.Figure(data=[go.Scatter(\n",
+      "    x=x,\n",
+      "    y=y,\n",
+      "    mode='markers',\n",
+      "    marker=dict(\n",
+      "        size=10,\n",
+      "        color='blue',\n",
+      "        opacity=0.7\n",
+      "    )\n",
+      ")])\n",
+      "\n",
+      "# グラフのタイトルと軸ラベルを設定\n",
+      "fig.update_layout(\n",
+      "    title='散布図のサンプル',\n",
+      "    xaxis_title='X軸',\n",
+      "    yaxis_title='Y軸'\n",
+      ")\n",
+      "\n",
+      "# グラフを表示\n",
+      "fig.show()\n",
+      "```\n",
+      "\n",
+      "このコードでは、numpyを使用してランダムなサンプルデータを生成し、plotly.graph_objectsのScatterオブジェクトを使用して散布図を作成しています。散布図のマーカーのサイズ、色、透明度を設定し、グラフのタイトルと軸ラベルを設定しています。最後に、`fig.show()`を使用してグラフを表示しています。\n",
+      "CPU times: user 19.8 s, sys: 645 ms, total: 20.5 s\n",
+      "Wall time: 20.4 s\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# Generate for the single prompt and print the text of its first completion.\n",
+    "outputs = llm.generate(prompt, sampling_params)\n",
+    "first_completion = outputs[0].outputs[0]\n",
+    "print(first_completion.text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1fb4a3d0-10ba-4eda-824d-e774322ddf07",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}