mgoin committed
Commit c387acd · 1 Parent(s): 3095bb3

Upload folder using huggingface_hub

Files changed (5)
  1. config.json +1 -1
  2. model-orig.onnx +2 -2
  3. model.data +2 -2
  4. model.onnx +2 -2
  5. recipe.yaml +16 -2
config.json CHANGED
@@ -22,7 +22,7 @@
   "rope_theta": 10000.0,
   "tie_word_embeddings": false,
   "torch_dtype": "float32",
-  "transformers_version": "1.6.0.20231121",
+  "transformers_version": "1.6.0.20231122",
   "use_cache": true,
   "vocab_size": 32003
 }
model-orig.onnx CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e53c836a8f11f46fc72285b2e4a3c3d36b6ded1476d8ef8b6c557e44b01b5150
-size 1507291
+oid sha256:b4b8075e7628dabe70c67c2be3b8ab3422b5a53ace4f26cc2c6b0a14837ce11e
+size 1446314
model.data CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7307f75e29d15fa819f2ed5a87f5f73707972b7e0ff4ba473379004a297d092c
-size 14083016704
+oid sha256:e84a03128829eade0bb965b7e873f9d7adbc461d12f7172dc497876e50efb8f4
+size 14091612943
model.onnx CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9d10256e6878786d1e850a33e4ad89d6d8e3f88380e8684544f43584b88be6df
-size 1489716
+oid sha256:364bf53db1ca1d5288dd847cd99a7b99bb7918475eabd9f6beea40380764715d
+size 1428739
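
The three LFS pointer updates above only swap out content hashes and byte sizes. To confirm that a locally downloaded artifact matches its pointer, a minimal standard-library sketch (the path is a placeholder, not part of this commit):

    import hashlib
    import os

    # Placeholder path; substitute whichever artifact you pulled from this repo.
    path = "model.onnx"

    # Stream the file so multi-GB artifacts like model.data never load fully into RAM.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)

    # Compare these against the "oid sha256:..." and "size ..." lines of the pointer.
    print("oid sha256:", h.hexdigest())
    print("size:", os.path.getsize(path))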
recipe.yaml CHANGED
@@ -11,16 +11,29 @@ test_stage:
       - LlamaRotaryEmbedding
       - LlamaRMSNorm
       - SiLUActivation
-      # Skip quantizing the BMMs
-      - QuantizableMatMul
       # Skip quantizing the layers with the most sensitive activations
       - model.layers.3.mlp.down_proj
       - model.layers.38.mlp.down_proj
       - model.layers.39.mlp.down_proj
       - model.layers.0.mlp.down_proj
       - model.layers.37.mlp.down_proj
+      - MatMulOutput_QK
+      - MatMulOutput_PV
     post_oneshot_calibration: true
     scheme_overrides:
+      Linear:
+        weights:
+          num_bits: 8
+          symmetric: true
+          strategy: channel
+      MatMulLeftInput_QK:
+        input_activations:
+          num_bits: 8
+          symmetric: true
+      MatMulLeftInput_PV:
+        input_activations:
+          num_bits: 8
+          symmetric: true
       Embedding:
         input_activations: null
         weights:
@@ -34,3 +47,4 @@ test_stage:
       percdamp: 0.01
       mask_structure: "0:0"
       targets: ["re:model.layers.\\d*$"]
+
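
The substantive change is in recipe.yaml: instead of skipping every QuantizableMatMul, the recipe now quantizes only the left inputs of the attention QK and PV matmuls (8-bit, symmetric) while adding their outputs to the ignore list, and Linear weights get an explicit 8-bit symmetric per-channel override. As a rough numpy sketch of what "num_bits: 8, symmetric: true, strategy: channel" does to a weight matrix (an illustration of the scheme only, not SparseML's actual observer code):

    import numpy as np

    def quantize_per_channel_symmetric_int8(w: np.ndarray):
        """Symmetric INT8 quantization with one scale per output channel (row).

        Mirrors the recipe's num_bits/symmetric/strategy settings in spirit;
        SparseML's calibration-based observers differ in detail.
        """
        # One scale per output channel, sized so the largest magnitude maps to 127.
        max_abs = np.max(np.abs(w), axis=1, keepdims=True)
        scale = np.where(max_abs == 0, 1.0, max_abs / 127.0)
        q = np.clip(np.round(w / scale), -127, 127).astype(np.int8)
        return q, scale

    w = np.random.randn(4, 8).astype(np.float32)
    q, scale = quantize_per_channel_symmetric_int8(w)
    print("max reconstruction error:", np.max(np.abs(q * scale - w)))

Per the diff, the matmul outputs (MatMulOutput_QK, MatMulOutput_PV) stay unquantized alongside the most sensitive down_proj layers; only the left-hand matmul inputs carry 8-bit activation quantization.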