Update README.md
README.md (changed):
```diff
@@ -141,13 +141,14 @@ llm = Llama.from_pretrained(
     filename="*Q4_K_M.gguf", # suffix of the filename containing the level of quantization.
     n_ctx=32768, # The max sequence length to use - note that longer sequence lengths require much more resources
     n_threads=8, # The number of CPU threads to use, tailor to your system and the resulting performance
-    n_gpu_layers=
+    n_gpu_layers=35 # The number of layers to offload to GPU, if you have GPU acceleration available
 )
 
 # Simple inference example
 output = llm(
     """<s><|im_start|> user
-Hva kan jeg bruke einstape til?<|im_end
+Hva kan jeg bruke einstape til?<|im_end|>
+<|im_start|> assistant
 """, # Prompt
     max_tokens=512, # Generate up to 512 tokens
     stop=["<|im_end|>"], # Example stop token
@@ -161,7 +162,7 @@ llm.create_chat_completion(
     messages = [
         {
             "role": "user",
-            "content": Hva kan jeg bruke einstape til?"
+            "content": "Hva kan jeg bruke einstape til?"
         }
     ]
 )
```
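For reference, the two fixes assemble into a runnable script along these lines. This is a sketch, not the README verbatim: the `repo_id` is a placeholder (the diff does not show it), and `n_gpu_layers=35` assumes GPU acceleration is available.

```python
from llama_cpp import Llama

# Load the quantized GGUF model from the Hugging Face Hub.
# NOTE: the repo_id below is a placeholder -- the diff does not show the
# actual repository used in this README.
llm = Llama.from_pretrained(
    repo_id="your-org/your-model-GGUF",  # placeholder, not from the diff
    filename="*Q4_K_M.gguf",  # suffix of the filename containing the level of quantization
    n_ctx=32768,              # max sequence length; longer lengths require much more resources
    n_threads=8,              # CPU threads, tailor to your system
    n_gpu_layers=35,          # layers to offload to GPU; set to 0 for CPU-only inference
)

# Simple inference example. The fix closes the user turn with <|im_end|> and
# opens an assistant turn, so the model answers instead of continuing the
# user's text.
output = llm(
    """<s><|im_start|> user
Hva kan jeg bruke einstape til?<|im_end|>
<|im_start|> assistant
""",                      # Norwegian: "What can I use bracken (einstape) for?"
    max_tokens=512,       # generate up to 512 tokens
    stop=["<|im_end|>"],  # stop when the assistant closes its turn
)
print(output["choices"][0]["text"])
```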
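The second hunk fixes a missing opening quote in the chat-style variant. With `create_chat_completion`, llama-cpp-python applies the model's chat template itself, so the `<s>`/`<|im_start|>` tokens above don't need to be written by hand:

```python
# Chat-style inference: the chat template (and its special tokens) is applied
# by llama-cpp-python, so only plain message dicts are needed.
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": "Hva kan jeg bruke einstape til?"
        }
    ]
)
print(response["choices"][0]["message"]["content"])
```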