rajabmondal committed
Commit fdf1a81 · verified · 1 Parent(s): c45b55d

Update README.md

Files changed (1)
  1. README.md +3 -3
README.md CHANGED
@@ -243,7 +243,7 @@ from llama_cpp import Llama
 
 # Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
 llm = Llama(
-  model_path="./mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", # Download the model file first
+  model_path="./NT-Java-1.1B_Q4_K_M.gguf", # Download the model file first
   n_ctx=2048, # The max sequence length to use - note that longer sequence lengths require much more resources
   n_threads=8, # The number of CPU threads to use, tailor to your system and the resulting performance
   n_gpu_layers=35 # The number of layers to offload to GPU, if you have GPU acceleration available
@@ -251,7 +251,7 @@ llm = Llama(
 
 # Simple inference example
 output = llm(
-  "[INST] {prompt} [/INST]", # Prompt
+  "{prompt}", # Prompt
   max_tokens=512, # Generate up to 512 tokens
   stop=["</s>"], # Example stop token - not necessarily correct for this specific model! Please check before using.
   echo=True # Whether to echo the prompt
@@ -259,7 +259,7 @@ output = llm(
 
 # Chat Completion API
 
-llm = Llama(model_path="./mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", chat_format="llama-2") # Set chat_format according to the model you are using
+llm = Llama(model_path="./NT-Java-1.1B_Q4_K_M.gguf", chat_format="llama-2") # Set chat_format according to the model you are using
 llm.create_chat_completion(
     messages = [
         {"role": "system", "content": "You are a story writing assistant."},