Update app.py
Moved the modelfile download into @cache, added flash attention, and set the number of threads to 2.
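The "@cache" in the message presumably refers to Streamlit's resource caching, which is not itself visible in this diff. A minimal sketch of the idea, assuming a hypothetical load_llama() helper decorated with st.cache_resource so the GGUF is downloaded and loaded once rather than on every Streamlit rerun:

import os
import streamlit as st
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

@st.cache_resource  # assumed decorator; memoizes the loaded model across reruns
def load_llama(n_ctx: int = 8192, verbose: bool = False) -> Llama:
    # hf_hub_download reuses the local Hugging Face cache,
    # so the GGUF file is fetched once and reused afterwards
    modelfile = hf_hub_download(
        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
    )
    return Llama(
        model_path=modelfile,
        n_threads=2,      # pin to 2 threads for the Space's CPU hardware
        n_ctx=n_ctx,
        flash_attn=True,  # enable llama.cpp flash-attention kernels
        verbose=verbose,
    )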
app.py CHANGED

@@ -25,11 +25,6 @@ st.set_page_config(
     page_icon="π",
     layout="wide")
 
-if "modelfile" not in st.session_state:
-    st.session_state.modelfile = hf_hub_download(
-        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
-        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
-    )
 
 if "hf_model" not in st.session_state:
     st.session_state.hf_model = "Gemma2-2B-it"
@@ -79,14 +74,20 @@ def genRANstring(n):
 def create_chat():
     # Set HF API token and HF repo
     from llama_cpp import Llama
+    modelfile = hf_hub_download(
+        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
+        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
+    )
     client = Llama(
-        model_path=st.session_state.modelfile,
+        model_path=modelfile,
         #n_gpu_layers=-1, #enable GPU
+        n_threads=2,
         temperature=0.24,
         n_ctx=nCTX,
         max_tokens=600,
         repeat_penalty=1.176,
         stop=sTOPS,
+        flash_attn=True,
         verbose=verbosity,
     )
     print('loading gemma-2-2b-it-Q5_K_M.gguf with LlamaCPP...')
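Worth noting: in llama-cpp-python, sampling options such as temperature, max_tokens, repeat_penalty, and stop are typically arguments to the completion calls rather than to the Llama() constructor. A usage sketch under that assumption, reusing the hypothetical load_llama() helper from above (the "<eos>" stop list is an assumption, since sTOPS is defined outside this diff):

# sampling parameters go to the completion call, not the constructor
client = load_llama()
response = client.create_chat_completion(
    messages=[{"role": "user", "content": "Introduce yourself in one sentence."}],
    temperature=0.24,
    max_tokens=600,
    repeat_penalty=1.176,
    stop=["<eos>"],  # assumed stop tokens; app.py's sTOPS is not shown here
)
print(response["choices"][0]["message"]["content"])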