sharpenb committed (verified)
Commit 35e442e · 1 Parent(s): c93dae2

Upload folder using huggingface_hub (#2)

- 7c9a9d2b4cb1e4c431ecc30299a724a301048b97d8b211970db13ac36f13f5a0 (5f6dabd80cfe0a31a734e4d9235bead6bb0d8dad)
- 785f37f6660877dca19109cb08dca4847ff00c85b43080369daa95094e6fe000 (3292dfef2acf53c7c3d9b3707e07fb2c944abeca)

Files changed (5)
  1. README.md +7 -7
  2. config.json +44 -45
  3. generation_config.json +1 -1
  4. model.safetensors +2 -2
  5. smash_config.json +11 -23
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 thumbnail: "https://assets-global.website-files.com/646b351987a8d8ce158d1940/64ec9e96b4334c0e1ac41504_Logo%20with%20white%20text.svg"
-base_model: PowerInfer/SmallThinker-3B-Preview
+base_model: ORIGINAL_REPO_NAME
 metrics:
 - memory_disk
 - memory_inference
@@ -14,8 +14,8 @@ tags:
 <!-- header start -->
 <!-- 200823 -->
 <div style="width: auto; margin-left: auto; margin-right: auto">
-<a href="https://docs.pruna.ai/en/latest/setup/pip.html" target="_blank" rel="noopener noreferrer">
-<img src="https://imgur.com/rVAgqMY.png" alt="PrunaAI" style="width: 100%; min-width: 400px; display: block; margin: auto;">
+<a href="https://www.pruna.ai/" target="_blank" rel="noopener noreferrer">
+<img src="https://i.imgur.com/eDAlcgk.png" alt="PrunaAI" style="width: 100%; min-width: 400px; display: block; margin: auto;">
 </a>
 </div>
 <!-- header end -->
@@ -52,7 +52,7 @@ tags:
 
 You can run the smashed model with these steps:
 
-0. Check requirements from the original repo PowerInfer/SmallThinker-3B-Preview installed. In particular, check python, cuda, and transformers versions.
+0. Check requirements from the original repo ORIGINAL_REPO_NAME installed. In particular, check python, cuda, and transformers versions.
 1. Make sure that you have installed quantization related packages.
 ```bash
 pip install transformers accelerate bitsandbytes>0.37.0
@@ -63,7 +63,7 @@ You can run the smashed model with these steps:
 
 
 model = AutoModelForCausalLM.from_pretrained("PrunaAI/PowerInfer-SmallThinker-3B-Preview-bnb-8bit-smashed", trust_remote_code=True, device_map='auto')
-tokenizer = AutoTokenizer.from_pretrained("PowerInfer/SmallThinker-3B-Preview")
+tokenizer = AutoTokenizer.from_pretrained("ORIGINAL_REPO_NAME")
 
 input_ids = tokenizer("What is the color of prunes?,", return_tensors='pt').to(model.device)["input_ids"]
 
@@ -77,9 +77,9 @@ The configuration info are in `smash_config.json`.
 
 ## Credits & License
 
-The license of the smashed model follows the license of the original model. Please check the license of the original model PowerInfer/SmallThinker-3B-Preview before using this model which provided the base model. The license of the `pruna-engine` is [here](https://pypi.org/project/pruna-engine/) on Pypi.
+The license of the smashed model follows the license of the original model. Please check the license of the original model ORIGINAL_REPO_NAME before using this model which provided the base model. The license of the `pruna-engine` is [here](https://pypi.org/project/pruna-engine/) on Pypi.
 
 ## Want to compress other models?
 
 - Contact us and tell us which model to compress next [here](https://www.pruna.ai/contact).
-- Do it by yourself [here](https://docs.pruna.ai/en/latest/setup/pip.html).
+- Request access to easily compress your own AI models [here](https://z0halsaff74.typeform.com/pruna-access?typeform-source=www.pruna.ai).
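A minimal end-to-end sketch assembling the usage snippets quoted in the README diff above. `ORIGINAL_REPO_NAME` is the card template's placeholder; for this repository it resolves to PowerInfer/SmallThinker-3B-Preview. The `generate` call and its token budget are illustrative completions, not part of the diff:

```python
# Assumes: pip install transformers accelerate bitsandbytes>0.37.0
# and a CUDA device, per the README's steps 0 and 1.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "PrunaAI/PowerInfer-SmallThinker-3B-Preview-bnb-8bit-smashed",
    trust_remote_code=True,
    device_map="auto",
)
# The original (base) repo provides the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("PowerInfer/SmallThinker-3B-Preview")

input_ids = tokenizer("What is the color of prunes?,", return_tensors="pt").to(model.device)["input_ids"]
outputs = model.generate(input_ids, max_new_tokens=64)  # illustrative length
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```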
config.json CHANGED
@@ -1,47 +1,46 @@
 {
-  "_name_or_path": "/covalent/.cache/models/tmpjx0icd9ti1ztb9qy",
+  "_name_or_path": "/tmp/models/tmploet3pwbdsmvc02y",
   "architectures": [
     "Qwen2ForCausalLM"
   ],
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
   "eos_token_id": 151645,
   "hidden_act": "silu",
   "hidden_size": 2048,
   "initializer_range": 0.02,
   "intermediate_size": 11008,
   "max_position_embeddings": 32768,
   "max_window_layers": 70,
   "model_type": "qwen2",
   "num_attention_heads": 16,
   "num_hidden_layers": 36,
   "num_key_value_heads": 2,
   "quantization_config": {
     "_load_in_4bit": false,
     "_load_in_8bit": true,
     "bnb_4bit_compute_dtype": "bfloat16",
     "bnb_4bit_quant_storage": "uint8",
     "bnb_4bit_quant_type": "fp4",
     "bnb_4bit_use_double_quant": false,
     "llm_int8_enable_fp32_cpu_offload": false,
     "llm_int8_has_fp16_weight": false,
     "llm_int8_skip_modules": [
       "lm_head"
     ],
     "llm_int8_threshold": 6.0,
     "load_in_4bit": false,
     "load_in_8bit": true,
     "quant_method": "bitsandbytes"
   },
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 1000000.0,
   "sliding_window": null,
   "tie_word_embeddings": true,
-  "torch_dtype": "float16",
-  "transformers_version": "4.46.2",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.48.2",
   "use_cache": true,
   "use_sliding_window": false,
-  "vocab_size": 151936,
-  "api_key": null
+  "vocab_size": 151936
 }
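The `quantization_config` block records an LLM.int8() load via bitsandbytes. As a sketch only (when loading this repo, `from_pretrained` picks these settings up from config.json automatically), the same settings expressed as an explicit `BitsAndBytesConfig` would look like this; the `bnb_4bit_*` fields are inert here since 8-bit loading is enabled:

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Mirrors the quantization_config above; illustrative, not required.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,            # outlier threshold for int8 matmul
    llm_int8_skip_modules=["lm_head"], # keep the LM head in higher precision
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_has_fp16_weight=False,
)
model = AutoModelForCausalLM.from_pretrained(
    "PrunaAI/PowerInfer-SmallThinker-3B-Preview-bnb-8bit-smashed",
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)
```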
generation_config.json CHANGED
@@ -10,5 +10,5 @@
   "temperature": 0.7,
   "top_k": 20,
   "top_p": 0.8,
-  "transformers_version": "4.46.2"
+  "transformers_version": "4.48.2"
 }
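These sampling defaults are read from generation_config.json automatically; passing them to `generate()` explicitly is equivalent. A sketch, assuming the model and tokenizer loaded above (`do_sample` and the token budget are assumptions, as the hunk only shows lines 10-14 of the file):

```python
# Equivalent explicit call with the diff's sampling parameters.
inputs = tokenizer("What is the color of prunes?", return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    do_sample=True,    # sampling must be on for temperature/top_k/top_p to apply
    temperature=0.7,
    top_k=20,
    top_p=0.8,
    max_new_tokens=64, # illustrative
)
```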
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:01fcca5ec6c29d5eb71d9a6d849052698462315c502da99ecbaf24007fd54117
-size 3401576604
+oid sha256:77503f073d91dbccd784bf4bf0caf997a7c7d06375258430c9a6837fd0e495e5
+size 3401576788
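The lines above are Git LFS pointer metadata, not the weights themselves; the downloaded file must match the recorded size and sha256. A quick integrity check with the standard library (the local path is an assumption):

```python
import hashlib
from pathlib import Path

path = Path("model.safetensors")  # assumed local download location
h = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
        h.update(chunk)

assert path.stat().st_size == 3401576788
assert h.hexdigest() == "77503f073d91dbccd784bf4bf0caf997a7c7d06375258430c9a6837fd0e495e5"
```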
smash_config.json CHANGED
@@ -1,24 +1,11 @@
 {
-  "comp_cgenerate_active": false,
-  "comp_ctranslate_active": false,
-  "comp_cwhisper_active": false,
-  "comp_diffusers2_active": false,
-  "comp_ifw_active": false,
-  "comp_onediff_active": false,
-  "comp_step_caching_active": false,
-  "comp_torch_compile_active": false,
-  "comp_ws2t_active": false,
-  "comp_x-fast_active": false,
-  "prune_torch-structured_active": false,
-  "quant_aqlm_active": false,
-  "quant_awq_active": false,
-  "quant_gptq_active": false,
-  "quant_half_active": false,
-  "quant_hqq_active": false,
-  "quant_llm-int8_active": true,
-  "quant_quanto_active": false,
-  "quant_torch_dynamic_active": false,
-  "quant_torch_static_active": false,
+  "batchers": null,
+  "cachers": null,
+  "compilers": null,
+  "distillers": null,
+  "pruners": null,
+  "quantizers": "llm-int8",
+  "recoverers": null,
   "quant_llm-int8_compute_dtype": "bfloat16",
   "quant_llm-int8_double_quant": false,
   "quant_llm-int8_enable_fp32_cpu_offload": false,
@@ -28,8 +15,9 @@
   "quant_llm-int8_weight_bits": 8,
   "max_batch_size": 1,
   "device": "cuda",
-  "cache_dir": "/covalent/.cache/models/tmpjx0icd9t",
+  "cache_dir": "/tmp/models/tmploet3pwb",
   "task": "",
-  "save_load_fn": "bitsandbytes",
-  "save_load_fn_args": {}
+  "save_load_fn": "llm-int8",
+  "save_load_fn_args": {},
+  "api_key": null
 }
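The new smash_config.json schema replaces the long list of per-algorithm `*_active` booleans with one slot per compression role, where `null` means no algorithm of that kind was applied. A small sketch for reading out which algorithms are active (role names taken from the diff above):

```python
import json

ROLES = ("batchers", "cachers", "compilers", "distillers",
         "pruners", "quantizers", "recoverers")

with open("smash_config.json") as f:
    cfg = json.load(f)

# Non-null slots are the applied compression algorithms.
active = {role: cfg[role] for role in ROLES if cfg.get(role)}
print(active)  # -> {'quantizers': 'llm-int8'}
```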