FM-1976 committed
Commit: e682338
Parent: 5ba8f3e

change method to Llama.from_pretrained


to load the model directly from the HF repository

Files changed (1): app.py (+9 -7)
app.py CHANGED
@@ -74,12 +74,14 @@ def genRANstring(n):
 def create_chat():
     # Set HF API token and HF repo
     from llama_cpp import Llama
-    modelfile = hf_hub_download(
-        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
-        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
-    )
-    client = Llama(
-        model_path=modelfile,
+    #modelfile = hf_hub_download(
+    #    repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
+    #    filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
+    #)
+    client = Llama.from_pretrained(
+        repo_id="bartowski/gemma-2-2b-it-GGUF",
+        filename="gemma-2-2b-it-Q4_K_S.gguf",
+        #model_path=modelfile,
         #n_gpu_layers=-1, #enable GPU
         n_threads =2,
         temperature=0.24,
@@ -90,7 +92,7 @@ def create_chat():
         flash_attn=True,
         verbose=verbosity,
     )
-    print('loading gemma-2-2b-it-Q5_K_M.gguf with LlamaCPP...')
+    print('loading gemma-2-2b-it-Q4_K_S.gguf with LlamaCPP...')
     return client
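For context, Llama.from_pretrained is a llama-cpp-python convenience constructor that downloads the GGUF file from the Hugging Face Hub (via the huggingface-hub package) into the local cache and builds the Llama instance in one call, replacing the two-step hf_hub_download + Llama(model_path=...) flow. A minimal standalone sketch of the new loading path, assuming the REPO_ID / MODEL_FILE env-var fallbacks from the removed code are still wanted (the commit itself hard-codes the values):

import os
from llama_cpp import Llama  # from_pretrained needs huggingface-hub installed

client = Llama.from_pretrained(
    repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
    filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q4_K_S.gguf"),
    n_threads=2,   # remaining kwargs are forwarded to the Llama() constructor
    verbose=False,
)
print(client("Q: What is a GGUF file? A:", max_tokens=32)["choices"][0]["text"])

Note that the commit also swaps the model file from Q5_K_M to Q4_K_S, a smaller quantization, so the download from the Hub is lighter at some cost in output quality.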