# Benchmark
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m
  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'
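
# Example manual dispatch from the CLI (a sketch; assumes this workflow is saved as
# .github/workflows/bench.yml and that the gh CLI is authenticated):
#   gh workflow run bench.yml \
#     -f gpu-series=Standard_NC4as_T4_v3 \
#     -f sha=<commit-sha> \
#     -f duration=10m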

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true
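
# Only one benchmark per workflow/ref/SHA combination runs at a time; a newer run
# in the same group cancels any run still in progress.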

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3

    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME: no way found yet to avoid duplicating the runs-on label
      N_USERS: 8
      DURATION: 10m

    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"
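
    # The matrix expands to three jobs (phi-2 in q4_0, q8_0 and f16); only the
    # q4_0 combination sets pr_comment_enabled and therefore posts the PR comment below.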

    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
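
      # The ref to build is resolved in priority order: the explicit sha input,
      # then the PR head commit, then the pushed or scheduled commit.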

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done
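
      # Prometheus is started in the background; the nc loop blocks until it is
      # listening on its default port 9090. It presumably scrapes the llama.cpp
      # server metrics as configured in examples/server/bench/prometheus.yml.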

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse
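
      # xk6 compiles a custom k6 binary from the k6 master branch with the SSE
      # extension bundled; the resulting ./k6 in examples/server/bench is
      # presumably what bench.py invokes to drive script.js.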

      - name: Build
        id: cmake_build
        run: |
          set -eux
          cmake -B build \
              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
              -DCMAKE_CUDA_ARCHITECTURES=75 \
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release;
          cmake --build build --config Release -j $(nproc) --target server
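
      # LLAMA_CUBLAS=ON builds the CUDA backend and CMAKE_CUDA_ARCHITECTURES=75
      # targets the T4 runner (Turing, compute capability 7.5); LLAMA_NATIVE=OFF
      # avoids -march=native so the binary does not depend on the build host CPU.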

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
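
      # The ShareGPT dump provides the prompt corpus; bench.py presumably samples
      # --n-prompts conversations from it, capped at --max-prompt-tokens each.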

      - name: Server bench
        id: server_bench
        run: |
          set -eux
          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch ${{ github.head_ref || github.ref_name }} \
              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
              --scenario script.js \
              --duration ${{ github.event.inputs.duration || env.DURATION }} \
              --hf-repo ggml-org/models \
              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
              --model-path-prefix /models \
              --parallel ${{ env.N_USERS }} \
              -ngl 33 \
              --batch-size 2048 \
              --ubatch-size 256 \
              --ctx-size 16384 \
              --n-prompts 1000 \
              --max-prompt-tokens 1024 \
              --max-tokens 2048
          cat results.github.env >> $GITHUB_ENV
          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json
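
      # results.github.env is exported into GITHUB_ENV so later steps can read the
      # benchmark metrics; the *.jpg and *.mermaid charts referenced below are
      # presumably written by bench.py as well.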

      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log
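
      # BENCH_RESULTS used below is assumed to be one of the variables exported via
      # results.github.env in the bench step above.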

      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{ secrets.GITHUB_TOKEN }}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/[email protected]
        continue-on-error: true # important: the Imgur API looks unstable (intermittent 503s)
        id: imgur_step
        with:
          client_id: ${{ secrets.IMGUR_CLIENT_ID }}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux
          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux
          echo "IMAGE_0=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">

            **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS }} iterations**

            </p>

            <details>

            <summary>Expand details for performance-related PRs only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request: avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_0 }}" alt="prompt_tokens_seconds" />

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>

            </details>

            </details>