compilade committed
Commit bc00f19 · 1 Parent(s): a57c29d

Add benchmarking script

Files changed (2)
  1. BENCHMARKING.md +26 -0
  2. bench-TriLMs.sh +101 -0
BENCHMARKING.md ADDED
@@ -0,0 +1,26 @@
+ # Benchmarking models
+
+ To use `bench-TriLMs.sh`, you need to
+
+ - Place it in a `llama.cpp` checkout
+ - Have `cmake`, `gcc`, and the other build dependencies of `llama.cpp`
+ - If you want to benchmark on GPUs, also have the necessary compile-time dependencies (like the CUDA toolkit); the script only attempts GPU benchmarks when `nvidia-smi` is present
+
+ The script will automatically download the models and quantize the different variants.
+
+ It will then produce two result files, one called `results-$(date +%s).json` and the other called `results-$(date +%s)-info.txt`. Both use the exact same timestamp.
+
+ The intention is to eventually read the produced `.json` in a Python script with something like
+
+ ```python
+ from __future__ import annotations
+
+ from typing import Any
+ import json
+
+ with open("results-1234567890.json") as f:
+     # The script writes a comma after every benchmark run, so strip the
+     # trailing one before wrapping everything into a top-level JSON list.
+     data: list[list[dict[str, Any]]] = json.loads("[" + f.read().rstrip().rstrip(",") + "]")
+
+ # Then use that data
+ ...
+ ```
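+
+ For example, a minimal sketch of tabulating tokens-per-second from that data
+ (assuming `llama-bench`'s JSON field names, e.g. `model_type`, `n_threads`,
+ `n_prompt`, `n_gen`, and `avg_ts`):
+
+ ```python
+ for run in data:
+     for result in run:
+         # One entry per test: prompt processing (n_gen == 0) or text generation.
+         test = f"pp{result['n_prompt']}" if result["n_gen"] == 0 else f"tg{result['n_gen']}"
+         # avg_ts is the mean tokens per second across the repetitions.
+         print(result["model_type"], result["n_threads"], test, result["avg_ts"])
+ ```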
bench-TriLMs.sh ADDED
@@ -0,0 +1,101 @@
+ #!/usr/bin/env bash
+ set -eux
+
+ cd "$(dirname "$0")"
+
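+ # Model sizes (in billions of parameters) and the quantization types to test;
+ # gputypes leaves out the ternary types, which are only benchmarked on CPU.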
+ MODEL_DIR="bench-TriLMs-models"
+ LLAMA_CPP_PATH="."
+ sizes=("1.5" "2.4" "3.9")
+ types=("TQ1_0" "TQ2_0" "Q8_0" "F16" "BF16")
+ gputypes=("Q8_0" "F16" "BF16")
+
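+ # Download the base GGUF models from the quant-tests repo, skipping any that
+ # were already fetched.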
+ function gather_models() {
+     echo Gather the models
+     if [ ! -d "$MODEL_DIR" ]; then
+         mkdir -p -- "$MODEL_DIR"
+     fi
+     (
+         cd "$MODEL_DIR"
+         for sz in "${sizes[@]}"; do
+             filename="TriLM_${sz}B_Unpacked-TQ1_0-F16.gguf"
+             if [ ! -f "$filename" ]; then
+                 wget "https://huggingface.co/compilade/quant-tests/resolve/main/${filename}"
+             fi
+         done
+     )
+ }
+
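+ # Configure and build llama-bench and llama-quantize from scratch; extra
+ # arguments are passed through to cmake. An existing build directory is
+ # removed first (rm -rI prompts once before deleting).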
+ function build_llama_cpp() {
+     echo Build llama.cpp
+
+     (
+         cd -- "$LLAMA_CPP_PATH"
+         if [ -d build ]; then
+             pwd
+             echo 'rm -rI build'
+             rm -rI build
+         fi
+         mkdir build
+         cd build
+         cmake .. "$@"
+         make -j llama-bench llama-quantize
+     )
+ }
+
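+ # Produce every target type from the downloaded TQ1_0-F16 base files,
+ # skipping output files that already exist.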
+ function quantize() {
+     echo "Make all model types we'll test"
+     (
+         for sz in "${sizes[@]}"; do
+             for ty in "${types[@]}"; do
+                 filenames=("$MODEL_DIR"/TriLM_"${sz}"B_Unpacked-{TQ1_0-F16,"$ty"}.gguf)
+                 if [ ! -f "${filenames[1]}" ]; then
+                     "$LLAMA_CPP_PATH"/build/bin/llama-quantize --allow-requantize "${filenames[@]}" "$ty"
+                 fi
+             done
+         done
+     )
+ }
+
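+ # Benchmark every size/type combination at 1, 2, 4, and 8 threads, emitting
+ # one JSON array per run followed by a comma (see BENCHMARKING.md for how the
+ # output is meant to be parsed).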
+ function bench() {
+     echo Test each model one by one for different numbers of threads
+
+     for sz in "${sizes[@]}"; do
+         for ty in "$@"; do
+             for th in 1 2 4 8; do
+                 {
+                     "$LLAMA_CPP_PATH"/build/bin/llama-bench -v -m "${MODEL_DIR}/TriLM_${sz}B_Unpacked-${ty}.gguf" -t "${th}" -p 512 -n 128 -r 4 -o json
+                     printf "%s\n" ","
+                 }
+             done
+         done
+     done
+ }
+
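+ # Thin wrappers appending CPU or GPU results to the file given as $1.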
+ function bench_cpu() {
+     bench "${types[@]}" >> "$1"
+ }
+
+ function bench_gpu() {
+     bench "${gputypes[@]}" >> "$1"
+ }
+
+ currentTime="$(date +'%s')"
+ resultFile="results-${currentTime}.json"
+ infoFile="results-${currentTime}-info.txt"
+ lscpu > "$infoFile"
+
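+ # Main driver: fetch models, build for CPU, quantize, record the model file
+ # sizes in the info file, then benchmark (and again on GPU if one is present).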
+ gather_models
+ build_llama_cpp -DGGML_NATIVE=ON -DGGML_CPU=ON
+ quantize
+
+ echo "---" >> "$infoFile"
+ ls -go "$MODEL_DIR" >> "$infoFile"
+
+ bench_cpu "$resultFile"
+
+ if [ -x "$(command -v nvidia-smi)" ]; then
+     echo GPU detected, benchmark with that too.
+     build_llama_cpp -DGGML_NATIVE=ON -DGGML_CUDA=ON -DGGML_CUDA_F16=ON
+     bench_gpu "$resultFile"
+ fi