change method to Llama.from_pretrained to load the model directly from the HF repository
app.py CHANGED
@@ -74,12 +74,14 @@ def genRANstring(n):
 def create_chat():
     # Set HF API token and HF repo
     from llama_cpp import Llama
-    modelfile = hf_hub_download(
-        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
-        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
-    )
-    client = Llama(
-        model_path=modelfile,
+    #modelfile = hf_hub_download(
+    #    repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
+    #    filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
+    #)
+    client = Llama.from_pretrained(
+        repo_id="bartowski/gemma-2-2b-it-GGUF",
+        filename="gemma-2-2b-it-Q4_K_S.gguf",
+        #model_path=modelfile,
         #n_gpu_layers=-1, #enable GPU
         n_threads =2,
         temperature=0.24,
@@ -90,7 +92,7 @@ def create_chat():
         flash_attn=True,
         verbose=verbosity,
     )
-    print('loading gemma-2-2b-it-Q5_K_M.gguf with LlamaCPP...')
+    print('loading gemma-2-2b-it-Q4_K_S.gguf with LlamaCPP...')
     return client
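
For reference, a minimal runnable sketch of the new loading path with llama-cpp-python's Llama.from_pretrained, which downloads and caches the GGUF from the Hub via huggingface_hub. The repo_id and filename match the diff; the prompt, max_tokens, and sampling settings below are illustrative assumptions, not values from the Space:

from llama_cpp import Llama

# Download (and cache) the quantized GGUF straight from the HF repository,
# then build the llama.cpp client in a single call.
client = Llama.from_pretrained(
    repo_id="bartowski/gemma-2-2b-it-GGUF",  # HF repo holding the GGUF quants
    filename="gemma-2-2b-it-Q4_K_S.gguf",    # which quantization to fetch
    n_threads=2,                             # CPU threads, as in the Space
    verbose=False,
)

# Illustrative chat call; prompt and sampling values are assumptions.
out = client.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    temperature=0.24,
    max_tokens=64,
)
print(out["choices"][0]["message"]["content"])

Compared with the old hf_hub_download + Llama(model_path=...) pair, from_pretrained folds download, caching, and client construction into one call, which is why the commit can comment out the modelfile block entirely.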