xsa-dev committed · Commit ec1a38e · 1 Parent(s): 2489c2c

glances...

Files changed (1)
  1. app.py +29 -16
app.py CHANGED
@@ -4,17 +4,24 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download  # load from huggingfaces
 
 
+CONST_REPO_ID = "TheBloke/Llama-2-7B-Chat-GGML"
+CONST_FILENAME = "llama-2-7b-chat.ggmlv3.q4_1.bin"
+
+N_CTX = 2048
+
 llm = Llama(model_path=hf_hub_download(
-    repo_id="TheBloke/Llama-2-7B-Chat-GGML",
-    filename="llama-2-7b-chat.ggmlv3.q4_1.bin"), n_ctx=2048)  # download model from hf/ n_ctx=2048 for high ccontext length
-history = []
+    repo_id=CONST_REPO_ID,
+    filename=CONST_FILENAME),
+    n_ctx=2048
+)
+history = N_CTX
 
-pre_prompt = " The user and the AI are having a conversation : <|endoftext|> \n "
+
+pre_prompt = \
+    " The user and the AI are having a conversation : <|endoftext|> \n"
 
 
 def generate_text(input_text, history):
-    print("history ", history)
-    print("input ", input_text)
     temp = ""
     if history == []:
         input_text_with_history = f"SYSTEM:{pre_prompt}" + \
@@ -22,9 +29,10 @@ def generate_text(input_text, history):
     else:
         input_text_with_history = f"{history[-1][1]}" + "\n"
     input_text_with_history += f"USER: {input_text}" + "\n" + " ASSISTANT:"
-    print("new input", input_text_with_history)
     output = llm(input_text_with_history, max_tokens=1024, stop=[
-        "<|prompter|>", "<|endoftext|>", "<|endoftext|> \n", "ASSISTANT:", "USER:", "SYSTEM:"], stream=True)
+        "<|prompter|>", "<|endoftext|>", "<|endoftext|> \n",
+        "ASSISTANT:", "USER:", "SYSTEM:"], stream=True
+    )
     for out in output:
         stream = copy.deepcopy(out)
         print(stream["choices"][0]["text"])
@@ -35,13 +43,18 @@ def generate_text(input_text, history):
 
 
 demo = gr.ChatInterface(generate_text,
-                        title="LLM on CPU",
-                        description="Running LLM with https://github.com/abetlen/llama-cpp-python. btw the text streaming thing was the hardest thing to impliment",
-                        examples=["Hello", "Am I cool?",
-                                  "Are tomatoes vegetables?"],
+                        title=f"Lama2 on CPU: {CONST_FILENAME}",
+                        description=f"Running Llama2 with llama_cpp: \
+                            \r\n<i>{CONST_REPO_ID} {CONST_FILENAME}</i>",
+                        examples=["Hi!",
+                                  "Does it hard to be machine?",
+                                  "When i am need a doctor?",
+                                  "Ты говоришь по русски? Я злой."
+                                  ],
                         cache_examples=True,
-                        retry_btn=None,
-                        undo_btn="Delete Previous",
-                        clear_btn="Clear",)
-demo.queue(concurrency_count=1, max_size=5)
+                        retry_btn="Retry",
+                        undo_btn="Undo",
+                        clear_btn="Clear")
+
+demo.queue(concurrency_count=10, max_size=50)
 demo.launch()
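
For reference, a minimal self-contained sketch of what app.py does after this commit: download the GGML weights from the Hub with hf_hub_download, load them on CPU with llama-cpp-python, stream the completion back chunk by chunk, and wrap it in gr.ChatInterface. The diff does not show the top of the file or the body of the streaming loop, so the copy and gradio imports and the temp accumulation / yield lines below are assumptions filled in for illustration; the example prompts are abbreviated, and the ChatInterface button arguments and queue(concurrency_count=...) follow the Gradio 3.x-era API used in the commit.

import copy  # assumed import, not shown in the diff

import gradio as gr  # assumed import, not shown in the diff
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

CONST_REPO_ID = "TheBloke/Llama-2-7B-Chat-GGML"
CONST_FILENAME = "llama-2-7b-chat.ggmlv3.q4_1.bin"
N_CTX = 2048

# Download the GGML weights from the Hub (cached locally) and load them on CPU.
llm = Llama(
    model_path=hf_hub_download(repo_id=CONST_REPO_ID, filename=CONST_FILENAME),
    n_ctx=N_CTX)

pre_prompt = " The user and the AI are having a conversation : <|endoftext|> \n"


def generate_text(input_text, history):
    # history is Gradio's list of [user, assistant] message pairs.
    if history == []:
        input_text_with_history = f"SYSTEM:{pre_prompt}"
    else:
        input_text_with_history = f"{history[-1][1]}" + "\n"
    input_text_with_history += f"USER: {input_text}" + "\n" + " ASSISTANT:"

    output = llm(input_text_with_history, max_tokens=1024,
                 stop=["<|prompter|>", "<|endoftext|>", "<|endoftext|> \n",
                       "ASSISTANT:", "USER:", "SYSTEM:"],
                 stream=True)

    # Accumulate the streamed chunks and yield the running text so the chat UI
    # updates as tokens arrive (the yield loop is elided in the diff; this is
    # an assumed reconstruction).
    temp = ""
    for out in output:
        stream = copy.deepcopy(out)
        temp += stream["choices"][0]["text"]
        yield temp


demo = gr.ChatInterface(
    generate_text,
    title=f"Lama2 on CPU: {CONST_FILENAME}",
    description=f"Running Llama2 with llama_cpp: <i>{CONST_REPO_ID} {CONST_FILENAME}</i>",
    examples=["Hi!", "Is it hard to be a machine?"],
    cache_examples=True,
    retry_btn="Retry", undo_btn="Undo", clear_btn="Clear")

demo.queue(concurrency_count=10, max_size=50)
demo.launch()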