kittn committed
Commit a9e321a · 1 Parent(s): 3a33745

Upload folder using huggingface_hub

Files changed (1)
  1. README.md +14 -10
README.md CHANGED

@@ -11,14 +11,18 @@ import torch
  from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline, TextStreamer
  
  tokenizer = LlamaTokenizer.from_pretrained("kittn/mistral-7B-v0.1-hf")
- model = LlamaForCausalLM.from_pretrained("kittn/mistral-7B-v0.1-hf", torch_dtype=torch.bfloat16, device_map={"": "0"})
+ model = LlamaForCausalLM.from_pretrained(
+     "kittn/mistral-7B-v0.1-hf",
+     torch_dtype=torch.bfloat16,
+     device_map={"": 0}
+ )
  
  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
  
  pipe("Hi, my name", streamer=TextStreamer(tokenizer), max_new_tokens=128)
  ```
  
- ### Load in bitsandbytes int8 (8GB VRAM or higher)
+ ### Load in bitsandbytes nf4 (6GB VRAM or higher, maybe less with double_quant)
  
  ```python
  import torch
@@ -27,9 +31,12 @@ from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline, TextStreame
  tokenizer = LlamaTokenizer.from_pretrained("kittn/mistral-7B-v0.1-hf")
  model = LlamaForCausalLM.from_pretrained(
      "kittn/mistral-7B-v0.1-hf",
-     device_map={"": "0"},
+     device_map={"": 0},
      quantization_config=BitsAndBytesConfig(
-         load_in_8bit=True,
+         load_in_4bit=True,
+         bnb_4bit_compute_dtype=torch.float16,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_use_double_quant=False,  # set to True to save more VRAM at the cost of some speed/accuracy
      ),
  )
  
@@ -38,7 +45,7 @@ pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
  pipe("Hi, my name", streamer=TextStreamer(tokenizer), max_new_tokens=128)
  ```
  
- ### Load in bitsandbytes nf4 (6GB VRAM or higher)
+ ### Load in bitsandbytes int8 (8GB VRAM or higher). Quite slow; not recommended.
  
  ```python
  import torch
@@ -47,12 +54,9 @@ from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline, TextStreame
  tokenizer = LlamaTokenizer.from_pretrained("kittn/mistral-7B-v0.1-hf")
  model = LlamaForCausalLM.from_pretrained(
      "kittn/mistral-7B-v0.1-hf",
-     device_map={"": "0"},
+     device_map={"": 0},
      quantization_config=BitsAndBytesConfig(
-         load_in_4bit=True,
-         bnb_4bit_compute_dtype=torch.float16,
-         bnb_4bit_quant_type="nf4",
-         bnb_4bit_use_double_quant=False,  # set to True to save more VRAM at the cost of some speed/accuracy
+         load_in_8bit=True,
      ),
  )
  
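
For reference, a minimal sketch (not part of this commit) of the nf4 recipe with the double-quant variant that the README's inline comment alludes to. It assumes a single CUDA GPU at index 0 and the bitsandbytes package installed; `get_memory_footprint()` is a stock transformers helper used here only to check the resulting size.

```python
import torch
from transformers import BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("kittn/mistral-7B-v0.1-hf")
model = LlamaForCausalLM.from_pretrained(
    "kittn/mistral-7B-v0.1-hf",
    device_map={"": 0},  # place the whole model on GPU 0
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,  # also quantize the quantization constants, trading some speed/accuracy for VRAM
    ),
)

# Report the model's parameter/buffer memory in GB (get_memory_footprint returns bytes)
print(f"{model.get_memory_footprint() / 1e9:.2f} GB")
```

With double quantization on, the footprint should come in somewhat below the plain nf4 load, which is why the heading hedges with "maybe less with double_quant".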