Vasily Alexeev
committed on
Commit
·
6758e8a
1
Parent(s):
70e329e
add asymm quantized model, add two eos in code sample
Browse files- README.md +15 -6
- compressa-config.json +4 -4
- config.json +1 -1
- model-00001-of-00002.safetensors +1 -1
- quant_config.json +1 -1
README.md
CHANGED
@@ -28,7 +28,7 @@ Quantized with [OmniQuant](https://github.com/OpenGVLab/OmniQuant).
|
|
28 |
| | wiki |
|
29 |
| --------- | ---- |
|
30 |
| FP | 8,29 |
|
31 |
-
| **Quantized** |
|
32 |
|
33 |
|
34 |
### Accuracy on English Benchmarks, % (↑)
|
@@ -36,7 +36,7 @@ Quantized with [OmniQuant](https://github.com/OpenGVLab/OmniQuant).
|
|
36 |
| | piqa | arc_easy | arc_challenge | boolq | hellaswag | winogrande | mmlu_humanities | mmlu_social_sciences | mmlu_stem | mmlu_other |
|
37 |
| --------- | ---- | -------- | ------------- | ----- | --------- | ---------- | --------------- | -------------------- | --------- | ---------- |
|
38 |
| FP | 78,7 | 81,6 | 53,0 | 83,1 | 57,7 | 72,1 | 67,0 | 70,9 | 54,5 | 68,2 |
|
39 |
-
| **Quantized** | 77,
|
40 |
|
41 |
|
42 |
### Accuracy on Russian Benchmarks, % (↑)
|
@@ -44,15 +44,15 @@ Quantized with [OmniQuant](https://github.com/OpenGVLab/OmniQuant).
|
|
44 |
| | danetqa | terra | rwsd | muserc | rucos | lidirus | parus | rcb | russe | rucola |
|
45 |
| --------- | ------- | ----- | ---- | ------ | ----- | ------- | ----- | ---- | ----- | ------ |
|
46 |
| FP | 78,6 | 60,9 | 65,7 | 56,1 | 64,9 | 63,2 | 71,0 | 34,1 | 60,8 | 64,1 |
|
47 |
-
| **Quantized** |
|
48 |
|
49 |
|
50 |
### Summary
|
51 |
|
52 |
| | Avg acc diff on Eng, % (↑) | Avg acc diff on Rus, % (↑) | Occupied disk space, % (↓) |
|
53 |
-
| --------- | -------------------------- | -------------------------- |
|
54 |
-
| FP | 0 | 0 | 100
|
55 |
-
| **Quantized** | \-
|
56 |
|
57 |
|
58 |
## Examples
|
@@ -178,6 +178,13 @@ tokenizer = AutoTokenizer.from_pretrained(
|
|
178 |
model_path, use_fast=False, trust_remote_code=True
|
179 |
)
|
180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
system_message = "You are a friendly chatbot who always responds in the style of a pirate."
|
182 |
user_message = "Where are we going, Captain?"
|
183 |
messages = [
|
@@ -194,6 +201,7 @@ inputs = {k: v.cuda() for k, v in inputs.items()}
|
|
194 |
outputs = model.generate(
|
195 |
**inputs, max_new_tokens=512,
|
196 |
do_sample=True, temperature=0.7, top_p=0.95,
|
|
|
197 |
)
|
198 |
|
199 |
response = tokenizer.decode(outputs[0])
|
@@ -210,6 +218,7 @@ print(f'Continuation:\n{continuation}\n')
|
|
210 |
pipe = pipeline(
|
211 |
"text-generation",
|
212 |
model=model, tokenizer=tokenizer,
|
|
|
213 |
max_new_tokens=512, do_sample=True,
|
214 |
temperature=0.7, top_p=0.95,
|
215 |
device=0,
|
|
|
28 |
| | wiki |
|
29 |
| --------- | ---- |
|
30 |
| FP | 8,29 |
|
31 |
+
| **Quantized** | 8,97 |
|
32 |
|
33 |
|
34 |
### Accuracy on English Benchmarks, % (↑)
|
|
|
36 |
| | piqa | arc_easy | arc_challenge | boolq | hellaswag | winogrande | mmlu_humanities | mmlu_social_sciences | mmlu_stem | mmlu_other |
|
37 |
| --------- | ---- | -------- | ------------- | ----- | --------- | ---------- | --------------- | -------------------- | --------- | ---------- |
|
38 |
| FP | 78,7 | 81,6 | 53,0 | 83,1 | 57,7 | 72,1 | 67,0 | 70,9 | 54,5 | 68,2 |
|
39 |
+
| **Quantized** | 77,2 | 80,7 | 51,8 | 82,8 | 56,8 | 72,5 | 63,4 | 67,6 | 50,1 | 65,0 |
|
40 |
|
41 |
|
42 |
### Accuracy on Russian Benchmarks, % (↑)
|
|
|
44 |
| | danetqa | terra | rwsd | muserc | rucos | lidirus | parus | rcb | russe | rucola |
|
45 |
| --------- | ------- | ----- | ---- | ------ | ----- | ------- | ----- | ---- | ----- | ------ |
|
46 |
| FP | 78,6 | 60,9 | 65,7 | 56,1 | 64,9 | 63,2 | 71,0 | 34,1 | 60,8 | 64,1 |
|
47 |
+
| **Quantized** | 71,6 | 60,6 | 52,5 | 63,7 | 57,3 | 57,2 | 74,0 | 33,6 | 36,9 | 67,5 |
|
48 |
|
49 |
|
50 |
### Summary
|
51 |
|
52 |
| | Avg acc diff on Eng, % (↑) | Avg acc diff on Rus, % (↑) | Occupied disk space, % (↓) |
|
53 |
+
| --------- | -------------------------- | -------------------------- | -------------------------- |
|
54 |
+
| FP | 0 | 0 | 100 |
|
55 |
+
| **Quantized** | \-1,9 | \-4,5 | 35,7 |
|
56 |
|
57 |
|
58 |
## Examples
|
|
|
178 |
model_path, use_fast=False, trust_remote_code=True
|
179 |
)
|
180 |
|
181 |
+
# Llama 3 "specifics"
|
182 |
+
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/discussions/4
|
183 |
+
terminators = [
|
184 |
+
tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
|
185 |
+
tokenizer.convert_tokens_to_ids("<|eot_id|>")
|
186 |
+
]
|
187 |
+
|
188 |
system_message = "You are a friendly chatbot who always responds in the style of a pirate."
|
189 |
user_message = "Where are we going, Captain?"
|
190 |
messages = [
|
|
|
201 |
outputs = model.generate(
|
202 |
**inputs, max_new_tokens=512,
|
203 |
do_sample=True, temperature=0.7, top_p=0.95,
|
204 |
+
eos_token_id=terminators,
|
205 |
)
|
206 |
|
207 |
response = tokenizer.decode(outputs[0])
|
|
|
218 |
pipe = pipeline(
|
219 |
"text-generation",
|
220 |
model=model, tokenizer=tokenizer,
|
221 |
+
eos_token_id=terminators,
|
222 |
max_new_tokens=512, do_sample=True,
|
223 |
temperature=0.7, top_p=0.95,
|
224 |
device=0,
|
compressa-config.json
CHANGED
@@ -4,19 +4,19 @@
|
|
4 |
"wbits": 4,
|
5 |
"abits": 16,
|
6 |
"group_size": 128,
|
7 |
-
"symmetric":
|
8 |
},
|
9 |
"resume": null,
|
10 |
"start_sample": 0,
|
11 |
"nsamples": 128,
|
12 |
-
"epochs":
|
13 |
"aug_loss": true,
|
14 |
"eval_ppl": true,
|
15 |
"real_quant": true,
|
16 |
"lwc_lr": 0.01,
|
17 |
"use_lr_scheduler": false,
|
18 |
"cache_dir": "resources/cache",
|
19 |
-
"output_dir": "resources/models/models/NousResearch_Meta-Llama-3-8B-
|
20 |
-
"save_dir": "resources/models/models/NousResearch_Meta-Llama-3-8B-
|
21 |
"config_class": "OmniquantConfig"
|
22 |
}
|
|
|
4 |
"wbits": 4,
|
5 |
"abits": 16,
|
6 |
"group_size": 128,
|
7 |
+
"symmetric": false
|
8 |
},
|
9 |
"resume": null,
|
10 |
"start_sample": 0,
|
11 |
"nsamples": 128,
|
12 |
+
"epochs": 20,
|
13 |
"aug_loss": true,
|
14 |
"eval_ppl": true,
|
15 |
"real_quant": true,
|
16 |
"lwc_lr": 0.01,
|
17 |
"use_lr_scheduler": false,
|
18 |
"cache_dir": "resources/cache",
|
19 |
+
"output_dir": "resources/models/models/NousResearch_Meta-Llama-3-8B-Instruct_omniquant_asymm_e20/logs",
|
20 |
+
"save_dir": "resources/models/models/NousResearch_Meta-Llama-3-8B-Instruct_omniquant_asymm_e20/NousResearch_Meta-Llama-3-8B-Instruct",
|
21 |
"config_class": "OmniquantConfig"
|
22 |
}
|
config.json
CHANGED
@@ -29,7 +29,7 @@
|
|
29 |
"quant_method": "gptq",
|
30 |
"bits": 4,
|
31 |
"group_size": 128,
|
32 |
-
"sym":
|
33 |
"desc_act": true,
|
34 |
"disable_exllama": true
|
35 |
}
|
|
|
29 |
"quant_method": "gptq",
|
30 |
"bits": 4,
|
31 |
"group_size": 128,
|
32 |
+
"sym": false,
|
33 |
"desc_act": true,
|
34 |
"disable_exllama": true
|
35 |
}
|
model-00001-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4682270360
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1e6dabb82f808aafc414164a93f5c0ba4a631ec63f331bea0fc5e330c691e2c0
|
3 |
size 4682270360
|
quant_config.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"wbits": 4, "abits": 16, "group_size": 128, "symmetric":
|
|
|
1 |
+
{"wbits": 4, "abits": 16, "group_size": 128, "symmetric": false}
|