FM-1976 committed
Commit: e682338
Parent: 5ba8f3e

change method to Llama.from_pretrained


to load the model directly from the HF repository

Files changed (1): app.py (+9 -7)
app.py CHANGED
@@ -74,12 +74,14 @@ def genRANstring(n):
 def create_chat():
     # Set HF API token and HF repo
     from llama_cpp import Llama
-    modelfile = hf_hub_download(
-        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
-        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
-    )
-    client = Llama(
-        model_path=modelfile,
+    #modelfile = hf_hub_download(
+    #    repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
+    #    filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
+    #)
+    client = Llama.from_pretrained(
+        repo_id="bartowski/gemma-2-2b-it-GGUF",
+        filename="gemma-2-2b-it-Q4_K_S.gguf",
+        #model_path=modelfile,
         #n_gpu_layers=-1, #enable GPU
         n_threads =2,
         temperature=0.24,
@@ -90,7 +92,7 @@ def create_chat():
         flash_attn=True,
         verbose=verbosity,
     )
-    print('loading gemma-2-2b-it-Q5_K_M.gguf with LlamaCPP...')
+    print('loading gemma-2-2b-it-Q4_K_S.gguf with LlamaCPP...')
     return client
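For context, Llama.from_pretrained is a llama-cpp-python convenience constructor that downloads the GGUF file from the Hugging Face Hub (via the huggingface-hub package) into the local cache and builds the Llama instance in one call, replacing the two-step hf_hub_download + Llama(model_path=...) flow. A minimal standalone sketch of the new loading path, assuming the REPO_ID / MODEL_FILE env-var fallbacks from the removed code are still wanted (the commit itself hard-codes the values):

import os
from llama_cpp import Llama  # from_pretrained needs huggingface-hub installed

client = Llama.from_pretrained(
    repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
    filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q4_K_S.gguf"),
    n_threads=2,   # remaining kwargs are forwarded to the Llama() constructor
    verbose=False,
)
print(client("Q: What is a GGUF file? A:", max_tokens=32)["choices"][0]["text"])

Note that the commit also swaps the model file from Q5_K_M to Q4_K_S, a smaller quantization, so the download from the Hub is lighter at some cost in output quality.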