Upload folder using huggingface_hub (#2)
- 7c9a9d2b4cb1e4c431ecc30299a724a301048b97d8b211970db13ac36f13f5a0 (5f6dabd80cfe0a31a734e4d9235bead6bb0d8dad)
- 785f37f6660877dca19109cb08dca4847ff00c85b43080369daa95094e6fe000 (3292dfef2acf53c7c3d9b3707e07fb2c944abeca)
- README.md +7 -7
- config.json +44 -45
- generation_config.json +1 -1
- model.safetensors +2 -2
- smash_config.json +11 -23
README.md
CHANGED

@@ -1,6 +1,6 @@
 ---
 thumbnail: "https://assets-global.website-files.com/646b351987a8d8ce158d1940/64ec9e96b4334c0e1ac41504_Logo%20with%20white%20text.svg"
-base_model:
+base_model: ORIGINAL_REPO_NAME
 metrics:
 - memory_disk
 - memory_inference
@@ -14,8 +14,8 @@ tags:
 <!-- header start -->
 <!-- 200823 -->
 <div style="width: auto; margin-left: auto; margin-right: auto">
-  <a href="https://
-  <img src="https://imgur.com/
+  <a href="https://www.pruna.ai/" target="_blank" rel="noopener noreferrer">
+  <img src="https://i.imgur.com/eDAlcgk.png" alt="PrunaAI" style="width: 100%; min-width: 400px; display: block; margin: auto;">
 </a>
 </div>
 <!-- header end -->
@@ -52,7 +52,7 @@ tags:
 
 You can run the smashed model with these steps:
 
-0. Check requirements from the original repo
+0. Check requirements from the original repo ORIGINAL_REPO_NAME installed. In particular, check python, cuda, and transformers versions.
 1. Make sure that you have installed quantization related packages.
 ```bash
 pip install transformers accelerate bitsandbytes>0.37.0
@@ -63,7 +63,7 @@ You can run the smashed model with these steps:
 
 
 model = AutoModelForCausalLM.from_pretrained("PrunaAI/PowerInfer-SmallThinker-3B-Preview-bnb-8bit-smashed", trust_remote_code=True, device_map='auto')
-tokenizer = AutoTokenizer.from_pretrained("
+tokenizer = AutoTokenizer.from_pretrained("ORIGINAL_REPO_NAME")
 
 input_ids = tokenizer("What is the color of prunes?,", return_tensors='pt').to(model.device)["input_ids"]
 
@@ -77,9 +77,9 @@ The configuration info are in `smash_config.json`.
 
 ## Credits & License
 
-The license of the smashed model follows the license of the original model. Please check the license of the original model
+The license of the smashed model follows the license of the original model. Please check the license of the original model ORIGINAL_REPO_NAME before using this model which provided the base model. The license of the `pruna-engine` is [here](https://pypi.org/project/pruna-engine/) on Pypi.
 
 ## Want to compress other models?
 
 - Contact us and tell us which model to compress next [here](https://www.pruna.ai/contact).
-
+- Request access to easily compress your own AI models [here](https://z0halsaff74.typeform.com/pruna-access?typeform-source=www.pruna.ai).
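The new README lines assemble into a complete usage snippet once the `ORIGINAL_REPO_NAME` placeholder is resolved. Below is a minimal runnable sketch; the tokenizer repo `PowerInfer/SmallThinker-3B-Preview` is an assumption inferred from the smashed repo's name (the card itself shows only the placeholder), and `max_new_tokens=216` is an illustrative choice. Running it needs a CUDA GPU plus the `transformers`, `accelerate`, and `bitsandbytes` packages named in the install step.

```python
# Minimal sketch, assuming the base repo is PowerInfer/SmallThinker-3B-Preview.
from transformers import AutoModelForCausalLM, AutoTokenizer

# The smashed checkpoint carries its bitsandbytes 8-bit quantization_config,
# so no extra quantization arguments are needed at load time.
model = AutoModelForCausalLM.from_pretrained(
    "PrunaAI/PowerInfer-SmallThinker-3B-Preview-bnb-8bit-smashed",
    trust_remote_code=True,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("PowerInfer/SmallThinker-3B-Preview")  # assumed base repo

input_ids = tokenizer("What is the color of prunes?,", return_tensors="pt").to(model.device)["input_ids"]
outputs = model.generate(input_ids, max_new_tokens=216)  # illustrative token budget
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```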
config.json
CHANGED

@@ -1,47 +1,46 @@
 {
-  (old lines 2-21 are truncated in the source diff view)
-  "bnb_4bit_compute_dtype": "bfloat16",
-  "bnb_4bit_quant_storage": "uint8",
-  "bnb_4bit_quant_type": "fp4",
-  "bnb_4bit_use_double_quant": false,
-  "llm_int8_enable_fp32_cpu_offload": false,
-  "llm_int8_has_fp16_weight": false,
-  "llm_int8_skip_modules": [
-    "lm_head"
-  ],
-  "llm_int8_threshold": 6.0,
-  "load_in_4bit": false,
-  "load_in_8bit": true,
-  "quant_method": "bitsandbytes"
-  },
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 1000000.0,
-  "sliding_window": null,
-  "tie_word_embeddings": true,
-  "torch_dtype": "float16",
-  "transformers_version": "4.46.2",
-  "use_cache": true,
-  "use_sliding_window": false,
-  "vocab_size": 151936,
-  "api_key": null
-}
+  "_name_or_path": "/tmp/models/tmploet3pwbdsmvc02y",
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 70,
+  "model_type": "qwen2",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "quantization_config": {
+    "_load_in_4bit": false,
+    "_load_in_8bit": true,
+    "bnb_4bit_compute_dtype": "bfloat16",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "fp4",
+    "bnb_4bit_use_double_quant": false,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": [
+      "lm_head"
+    ],
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": false,
+    "load_in_8bit": true,
+    "quant_method": "bitsandbytes"
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.48.2",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
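The rewritten config.json keeps the same Qwen2 architecture and bitsandbytes LLM.int8 settings; the substantive changes are `torch_dtype` moving from `float16` to `bfloat16`, `transformers_version` from 4.46.2 to 4.48.2, and the stray top-level `"api_key"` field being dropped. As a sketch, here is how the `quantization_config` block maps onto transformers' `BitsAndBytesConfig` if one wanted to reproduce the quantization from an (assumed) base checkpoint; loading the smashed repo directly already applies these settings.

```python
# Sketch only: re-creating the quantization_config above with transformers.
# The base repo name is an assumption; the smashed repo needs none of this.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,                      # "load_in_8bit": true
    llm_int8_threshold=6.0,                 # outlier threshold for LLM.int8 matmuls
    llm_int8_skip_modules=["lm_head"],      # keep the output head unquantized
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_has_fp16_weight=False,
)

model = AutoModelForCausalLM.from_pretrained(
    "PowerInfer/SmallThinker-3B-Preview",   # assumed base repo
    quantization_config=bnb_config,
    torch_dtype="bfloat16",                 # matches the new "torch_dtype"
    device_map="auto",
)
```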
generation_config.json
CHANGED

@@ -10,5 +10,5 @@
   "temperature": 0.7,
   "top_k": 20,
   "top_p": 0.8,
-  "transformers_version": "4.
+  "transformers_version": "4.48.2"
 }
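Only the version stamp changes here; the sampling defaults (temperature 0.7, top_k 20, top_p 0.8) ship with the checkpoint and are picked up automatically by `generate`. A small sketch for loading and inspecting them:

```python
# Sketch: read the checked-in generation defaults from the Hub.
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained(
    "PrunaAI/PowerInfer-SmallThinker-3B-Preview-bnb-8bit-smashed"
)
print(gen_config.temperature, gen_config.top_k, gen_config.top_p)  # 0.7 20 0.8
print(gen_config.transformers_version)  # "4.48.2" after this commit
```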
model.safetensors
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:77503f073d91dbccd784bf4bf0caf997a7c7d06375258430c9a6837fd0e495e5
+size 3401576788
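What Git stores for model.safetensors is only this LFS pointer; the ~3.4 GB weight payload lives in LFS storage, and the `oid` is the SHA-256 of that payload. A sketch for verifying a downloaded copy against the pointer (the local path is hypothetical):

```python
# Sketch: verify a downloaded model.safetensors against the LFS pointer.
import hashlib
from pathlib import Path

EXPECTED_OID = "77503f073d91dbccd784bf4bf0caf997a7c7d06375258430c9a6837fd0e495e5"
EXPECTED_SIZE = 3401576788  # bytes, ~3.4 GB

path = Path("model.safetensors")  # hypothetical local download
assert path.stat().st_size == EXPECTED_SIZE, "size mismatch"

digest = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)
assert digest.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("pointer matches")
```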
smash_config.json
CHANGED

@@ -1,24 +1,11 @@
 {
-  (old lines 2-8 are truncated in the source diff view)
-  "comp_torch_compile_active": false,
-  "comp_ws2t_active": false,
-  "comp_x-fast_active": false,
-  "prune_torch-structured_active": false,
-  "quant_aqlm_active": false,
-  "quant_awq_active": false,
-  "quant_gptq_active": false,
-  "quant_half_active": false,
-  "quant_hqq_active": false,
-  "quant_llm-int8_active": true,
-  "quant_quanto_active": false,
-  "quant_torch_dynamic_active": false,
-  "quant_torch_static_active": false,
+  "batchers": null,
+  "cachers": null,
+  "compilers": null,
+  "distillers": null,
+  "pruners": null,
+  "quantizers": "llm-int8",
+  "recoverers": null,
   "quant_llm-int8_compute_dtype": "bfloat16",
   "quant_llm-int8_double_quant": false,
   "quant_llm-int8_enable_fp32_cpu_offload": false,
@@ -28,8 +15,9 @@
   "quant_llm-int8_weight_bits": 8,
   "max_batch_size": 1,
   "device": "cuda",
-  "cache_dir": "/
+  "cache_dir": "/tmp/models/tmploet3pwb",
   "task": "",
-  "save_load_fn": "
-  "save_load_fn_args": {}
+  "save_load_fn": "llm-int8",
+  "save_load_fn_args": {},
+  "api_key": null
 }
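The rewrite replaces the long list of per-algorithm `*_active` booleans with one field per compressor slot (`quantizers`, `pruners`, `compilers`, and so on), which makes it easy to see that only llm-int8 quantization was applied. A sketch for inspecting the file from the Hub:

```python
# Sketch: fetch smash_config.json and check which compressors were applied.
import json
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    "PrunaAI/PowerInfer-SmallThinker-3B-Preview-bnb-8bit-smashed",
    "smash_config.json",
)
with open(path) as f:
    smash_config = json.load(f)

print(smash_config["quantizers"])                  # "llm-int8"
print(smash_config["quant_llm-int8_weight_bits"])  # 8
```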