Update README.md
README.md (CHANGED)
@@ -243,7 +243,7 @@ from llama_cpp import Llama
 
 # Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
 llm = Llama(
-  model_path="./
+  model_path="./NT-Java-1.1B_Q4_K_M.gguf", # Download the model file first
   n_ctx=2048, # The max sequence length to use - note that longer sequence lengths require much more resources
   n_threads=8, # The number of CPU threads to use, tailor to your system and the resulting performance
   n_gpu_layers=35 # The number of layers to offload to GPU, if you have GPU acceleration available
@@ -251,7 +251,7 @@ llm = Llama(
 
 # Simple inference example
 output = llm(
-  "
+  "{prompt}", # Prompt
   max_tokens=512, # Generate up to 512 tokens
   stop=["</s>"], # Example stop token - not necessarily correct for this specific model! Please check before using.
   echo=True # Whether to echo the prompt
@@ -259,7 +259,7 @@ output = llm(
 
 # Chat Completion API
 
-llm = Llama(model_path="./
+llm = Llama(model_path="./NT-Java-1.1B_Q4_K_M.gguf", chat_format="llama-2") # Set chat_format according to the model you are using
 llm.create_chat_completion(
     messages = [
         {"role": "system", "content": "You are a story writing assistant."},