turingevo committed (verified)
Commit 01f01b5 · 1 parent: 416d6bc

Upload 6 files
.gitattributes CHANGED
@@ -38,3 +38,8 @@ qwen2vl-vision.gguf filter=lfs diff=lfs merge=lfs -text
 linux-x64-openlas-llama-qwen2vl-cli filter=lfs diff=lfs merge=lfs -text
 Qwen2-VL-2B-Instruct-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
 qwen2-vl-2b-instruct-vision.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen2-VL-2B-Instruct-BF16.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen2-VL-2B-Instruct-IQ3_M.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen2-VL-2B-Instruct-IQ3_S.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen2-VL-2B-Instruct-Q2_K.gguf filter=lfs diff=lfs merge=lfs -text
+Qwen2-VL-2B-Instruct-Q3_K.gguf filter=lfs diff=lfs merge=lfs -text
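
The five new entries follow the pattern that git lfs track writes into .gitattributes. A minimal sketch of how entries like these are typically produced, shown for one of the five files (the others are analogous):

# Each "git lfs track" call appends a matching
# "filter=lfs diff=lfs merge=lfs -text" line to .gitattributes.
git lfs install
git lfs track "Qwen2-VL-2B-Instruct-BF16.gguf"
git add .gitattributes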
Qwen2-VL-2B-Instruct-BF16.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06de2fcc6a4bf1064e0d8950f4337387c94626d2d2fa6bef2a9bda6655ae4fd7
+size 3093667552
Qwen2-VL-2B-Instruct-IQ3_M.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09c7dd2f7315b7b877fc00ba0402bc67aa44bb1fe4553b14f09ee9b1d95563b7
+size 776662240
Qwen2-VL-2B-Instruct-IQ3_S.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:429c7c026387d12fcefe23a174af54db608a0051a1752c94eb8bcb4dc88fd76b
+size 762405088
Qwen2-VL-2B-Instruct-Q2_K.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e87bb698addb5ca28c420a74bdf84fd61ece3b43a2f985f7118436db7b53ac84
+size 676303072
Qwen2-VL-2B-Instruct-Q3_K.gguf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5fca60e4c3222faf8f2e1159ca246786535ae9c884d13bce8c055de8c8437ba
+size 824176864
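
Each of the five blocks above is a Git LFS pointer file, not the model weights themselves: the oid line carries the SHA-256 digest of the real blob and the size line its length in bytes. After fetching the actual files (e.g. with git lfs pull), the pointers can be checked against what was downloaded; a sketch for the BF16 file, assuming GNU coreutils:

git lfs pull
# Digest must match the oid in the pointer:
# 06de2fcc6a4bf1064e0d8950f4337387c94626d2d2fa6bef2a9bda6655ae4fd7
sha256sum Qwen2-VL-2B-Instruct-BF16.gguf
# Byte count must match the pointer's size field (3093667552):
stat -c %s Qwen2-VL-2B-Instruct-BF16.gguf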
help ADDED
@@ -0,0 +1,72 @@
+# llama.cpp
+
+
+1. Convert the language model to GGUF:
+python convert_hf_to_gguf.py ./model_path --outtype f32
+
+2. Quantize the language model:
+./llama-quantize ./model_path/Qwen2-VL-2B-Instruct-F32.gguf Qwen2-VL-2B-Instruct-Q4_K_M.gguf Q4_K_M
+
+3. Convert the vision model:
+python examples/llava/qwen2_vl_surgery.py ./model_path
+
+4. Inference:
+llama-qwen2vl-cli -m Qwen2-VL-2B-Instruct-Q4_K_M.gguf --mmproj qwen2-vl-2b-instruct-vision.gguf -p "Describe this image" --image "1.png"
+
+
+
+
+######## llama-quantize
+
+usage: ./llama-quantize [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]
+
+  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16-bit or 32-bit
+  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing
+  --pure: Disable k-quant mixtures and quantize all tensors to the same type
+  --imatrix file_name: use data in file_name as importance matrix for quant optimizations
+  --include-weights tensor_name: use importance matrix for this/these tensor(s)
+  --exclude-weights tensor_name: do not use importance matrix for this/these tensor(s)
+  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor
+  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor
+  --keep-split: will generate quantized model in the same shards as input
+  --override-kv KEY=TYPE:VALUE
+      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.
+Note: --include-weights and --exclude-weights cannot be used together
+
+Allowed quantization types:
+   2  or  Q4_0    :  4.34G, +0.4685 ppl @ Llama-3-8B
+   3  or  Q4_1    :  4.78G, +0.4511 ppl @ Llama-3-8B
+   8  or  Q5_0    :  5.21G, +0.1316 ppl @ Llama-3-8B
+   9  or  Q5_1    :  5.65G, +0.1062 ppl @ Llama-3-8B
+  19  or  IQ2_XXS :  2.06 bpw quantization
+  20  or  IQ2_XS  :  2.31 bpw quantization
+  28  or  IQ2_S   :  2.5  bpw quantization
+  29  or  IQ2_M   :  2.7  bpw quantization
+  24  or  IQ1_S   :  1.56 bpw quantization
+  31  or  IQ1_M   :  1.75 bpw quantization
+  36  or  TQ1_0   :  1.69 bpw ternarization
+  37  or  TQ2_0   :  2.06 bpw ternarization
+  10  or  Q2_K    :  2.96G, +3.5199 ppl @ Llama-3-8B
+  21  or  Q2_K_S  :  2.96G, +3.1836 ppl @ Llama-3-8B
+  23  or  IQ3_XXS :  3.06 bpw quantization
+  26  or  IQ3_S   :  3.44 bpw quantization
+  27  or  IQ3_M   :  3.66 bpw quantization mix
+  12  or  Q3_K    :  alias for Q3_K_M
+  22  or  IQ3_XS  :  3.3 bpw quantization
+  11  or  Q3_K_S  :  3.41G, +1.6321 ppl @ Llama-3-8B
+  12  or  Q3_K_M  :  3.74G, +0.6569 ppl @ Llama-3-8B
+  13  or  Q3_K_L  :  4.03G, +0.5562 ppl @ Llama-3-8B
+  25  or  IQ4_NL  :  4.50 bpw non-linear quantization
+  30  or  IQ4_XS  :  4.25 bpw non-linear quantization
+  15  or  Q4_K    :  alias for Q4_K_M
+  14  or  Q4_K_S  :  4.37G, +0.2689 ppl @ Llama-3-8B
+  15  or  Q4_K_M  :  4.58G, +0.1754 ppl @ Llama-3-8B
+  17  or  Q5_K    :  alias for Q5_K_M
+  16  or  Q5_K_S  :  5.21G, +0.1049 ppl @ Llama-3-8B
+  17  or  Q5_K_M  :  5.33G, +0.0569 ppl @ Llama-3-8B
+  18  or  Q6_K    :  6.14G, +0.0217 ppl @ Llama-3-8B
+   7  or  Q8_0    :  7.96G, +0.0026 ppl @ Llama-3-8B
+   1  or  F16     : 14.00G, +0.0020 ppl @ Mistral-7B
+  32  or  BF16    : 14.00G, -0.0050 ppl @ Mistral-7B
+   0  or  F32     : 26.00G @ 7B
+          COPY    : only copy tensors, no quantizing
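
The IQ3_M, IQ3_S, Q2_K and Q3_K files uploaded in this commit correspond to step 2 of the workflow above, run once per target type. For the low-bpw IQ types, the help text documents an --imatrix option; a hedged sketch of using it, assuming llama.cpp's llama-imatrix tool is built alongside llama-quantize and that calib.txt is a placeholder name for your own calibration corpus:

# 1) Collect an importance matrix from a calibration run over calib.txt.
./llama-imatrix -m ./model_path/Qwen2-VL-2B-Instruct-F32.gguf -f calib.txt -o imatrix.dat
# 2) Pass it to llama-quantize when producing a low-bpw quant.
./llama-quantize --imatrix imatrix.dat ./model_path/Qwen2-VL-2B-Instruct-F32.gguf Qwen2-VL-2B-Instruct-IQ3_M.gguf IQ3_M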