```python
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline, TextStreamer

tokenizer = LlamaTokenizer.from_pretrained("kittn/mistral-7B-v0.1-hf")
model = LlamaForCausalLM.from_pretrained(
    "kittn/mistral-7B-v0.1-hf",
    torch_dtype=torch.bfloat16,
    device_map={"": 0}
)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

pipe("Hi, my name", streamer=TextStreamer(tokenizer), max_new_tokens=128)
```
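
`device_map={"": 0}` pins every module to GPU 0, so the full bf16 weights (roughly 14 GB for a 7B model at 2 bytes per parameter) have to fit on that one card. If you want to confirm what actually landed on the GPU, the sketch below is one way to check; it uses only stock `torch.cuda` and `transformers` calls and assumes the `model` from the snippet above is in scope.

```python
# Sketch: rough VRAM sanity check after loading (assumes a single CUDA GPU at index 0).
footprint_gib = model.get_memory_footprint() / 1024**3            # weights + buffers
total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
print(f"model footprint: {footprint_gib:.1f} GiB of {total_gib:.1f} GiB on GPU 0")

# Peak allocation so far; generation needs extra headroom for activations and the KV cache.
print(f"peak allocated: {torch.cuda.max_memory_allocated(0) / 1024**3:.1f} GiB")
```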

### Load in bitsandbytes nf4 (6GB VRAM or higher, maybe less with double_quant)

```python
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline, TextStreamer, BitsAndBytesConfig

tokenizer = LlamaTokenizer.from_pretrained("kittn/mistral-7B-v0.1-hf")
model = LlamaForCausalLM.from_pretrained(
    "kittn/mistral-7B-v0.1-hf",
    device_map={"": 0},
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=False,  # set to True to save more VRAM at the cost of some speed/accuracy
    ),
)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

pipe("Hi, my name", streamer=TextStreamer(tokenizer), max_new_tokens=128)
```
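
As the comment above notes, nested ("double") quantization squeezes out a bit more VRAM by also quantizing the quantization constants; the QLoRA paper puts the saving at roughly 0.4 bits per parameter. A minimal sketch of the variant config, assuming everything else in the load call stays exactly as above:

```python
# Sketch: same nf4 load as above, with nested quantization enabled.
# Saves a little VRAM at some cost in speed (and possibly accuracy).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,  # the only change vs. the snippet above
)
```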

### Load in bitsandbytes int8 (8GB VRAM or higher). Quite slow; not recommended.

```python
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline, TextStreamer, BitsAndBytesConfig

tokenizer = LlamaTokenizer.from_pretrained("kittn/mistral-7B-v0.1-hf")
model = LlamaForCausalLM.from_pretrained(
    "kittn/mistral-7B-v0.1-hf",
    device_map={"": 0},
    quantization_config=BitsAndBytesConfig(
        load_in_8bit=True,
    ),
)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

pipe("Hi, my name", streamer=TextStreamer(tokenizer), max_new_tokens=128)
```
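
"Quite slow" is easy to quantify on your own hardware. The sketch below times greedy decoding and reports tokens per second; it assumes `model` and `tokenizer` are already in scope from one of the snippets above and works for any of the three variants.

```python
import time

# Sketch: measure generation speed for whichever variant of `model` is loaded.
inputs = tokenizer("Hi, my name", return_tensors="pt").to(model.device)

torch.cuda.synchronize()
start = time.perf_counter()
out = model.generate(**inputs, max_new_tokens=128, do_sample=False)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start

new_tokens = out.shape[-1] - inputs["input_ids"].shape[-1]
print(f"{new_tokens} tokens in {elapsed:.1f}s -> {new_tokens / elapsed:.1f} tokens/s")
```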