FM-1976 committed on
Commit 5ba8f3e · verified · 1 parent: 51eba2d

Update app.py


Moved modelfile to @cache; added flash attention and set n_threads=2.
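The hunks below move the download into create_chat(); the @cache the message refers to is not visible in this diff. A minimal sketch of that pattern, assuming Python's functools.cache and huggingface_hub.hf_hub_download (the helper name get_modelfile is hypothetical):

import os
from functools import cache
from huggingface_hub import hf_hub_download

@cache
def get_modelfile() -> str:
    # hf_hub_download resolves the file via the local HF cache (downloading
    # only if missing); @cache memoizes the returned path so repeated calls
    # skip the lookup entirely.
    return hf_hub_download(
        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
    )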

Files changed (1): app.py +7 -6
app.py CHANGED
@@ -25,11 +25,6 @@ st.set_page_config(
     page_icon="🌟",
     layout="wide")
 
-if "modelfile" not in st.session_state:
-    st.session_state.modelfile = hf_hub_download(
-        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
-        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
-    )
 
 if "hf_model" not in st.session_state:
     st.session_state.hf_model = "Gemma2-2B-it"
@@ -79,14 +74,20 @@ def genRANstring(n):
 def create_chat():
     # Set HF API token and HF repo
     from llama_cpp import Llama
+    modelfile = hf_hub_download(
+        repo_id=os.environ.get("REPO_ID", "bartowski/gemma-2-2b-it-GGUF"),
+        filename=os.environ.get("MODEL_FILE", "gemma-2-2b-it-Q5_K_M.gguf"),
+    )
     client = Llama(
-        model_path=st.session_state.modelfile,
+        model_path=modelfile,
         #n_gpu_layers=-1, #enable GPU
+        n_threads=2,
         temperature=0.24,
         n_ctx=nCTX,
         max_tokens=600,
         repeat_penalty=1.176,
         stop=sTOPS,
+        flash_attn=True,
         verbose=verbosity,
     )
     print('loading gemma-2-2b-it-Q5_K_M.gguf with LlamaCPP...')
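For context, a hedged usage sketch of the updated loader, assuming llama-cpp-python's Llama constructor (with flash_attn support) and its create_chat_completion API. nCTX, sTOPS, and verbosity are defined elsewhere in app.py, so literal stand-ins appear here, and get_modelfile is the hypothetical cached helper sketched above:

from llama_cpp import Llama

client = Llama(
    model_path=get_modelfile(),  # local GGUF path from the cached helper
    # n_gpu_layers=-1,           # uncomment to offload layers to GPU
    n_threads=2,                 # cap CPU threads, as in this commit
    n_ctx=8192,                  # stand-in for nCTX
    flash_attn=True,             # enable flash attention, as in this commit
    verbose=False,               # stand-in for verbosity
)
reply = client.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello."}],
    temperature=0.24,
    max_tokens=600,
    repeat_penalty=1.176,
    stop=["<eos>"],              # stand-in for sTOPS
)
print(reply["choices"][0]["message"]["content"])

Note that in llama-cpp-python, sampling settings such as temperature, max_tokens, repeat_penalty, and stop are per-call arguments to create_chat_completion rather than constructor options, which is why this sketch passes them at generation time.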