Update app.py (#89)
Browse files
- Update app.py (511f793683b66e3d1deff722345f2892f84ec845)
app.py
CHANGED
@@ -19,7 +19,7 @@ from textwrap import dedent
|
|
19 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
20 |
|
21 |
def generate_importance_matrix(model_path, train_data_path):
|
22 |
-
imatrix_command = f"./imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
|
23 |
|
24 |
os.chdir("llama.cpp")
|
25 |
|
@@ -146,9 +146,9 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
|
|
146 |
quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
|
147 |
quantized_gguf_path = quantized_gguf_name
|
148 |
if use_imatrix:
|
149 |
-
quantise_ggml = f"./llama.cpp/quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
|
150 |
else:
|
151 |
-
quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
|
152 |
result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
|
153 |
if result.returncode != 0:
|
154 |
raise Exception(f"Error quantizing: {result.stderr}")
|
@@ -186,7 +186,7 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
|
|
186 |
|
187 |
### CLI:
|
188 |
```bash
|
189 |
-
llama --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
|
190 |
```
|
191 |
|
192 |
### Server:
|
@@ -208,11 +208,11 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
|
|
208 |
|
209 |
Step 3: Run inference through the main binary.
|
210 |
```
|
211 |
-
./
|
212 |
```
|
213 |
or
|
214 |
```
|
215 |
-
./server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
|
216 |
```
|
217 |
"""
|
218 |
)
|
|
|
19 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
20 |
|
21 |
def generate_importance_matrix(model_path, train_data_path):
|
22 |
+
imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
|
23 |
|
24 |
os.chdir("llama.cpp")
|
25 |
|
|
|
146 |
quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
|
147 |
quantized_gguf_path = quantized_gguf_name
|
148 |
if use_imatrix:
|
149 |
+
quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
|
150 |
else:
|
151 |
+
quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
|
152 |
result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
|
153 |
if result.returncode != 0:
|
154 |
raise Exception(f"Error quantizing: {result.stderr}")
|
|
|
186 |
|
187 |
### CLI:
|
188 |
```bash
|
189 |
+
llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
|
190 |
```
|
191 |
|
192 |
### Server:
|
|
|
208 |
|
209 |
Step 3: Run inference through the main binary.
|
210 |
```
|
211 |
+
./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
|
212 |
```
|
213 |
or
|
214 |
```
|
215 |
+
./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
|
216 |
```
|
217 |
"""
|
218 |
)
|