xsa-dev committed · Commit ec1a38e · 1 Parent(s): 2489c2c

glances...

Files changed (1)
  1. app.py +29 -16
app.py CHANGED
@@ -4,17 +4,24 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download  # load from huggingfaces
 
 
+CONST_REPO_ID = "TheBloke/Llama-2-7B-Chat-GGML"
+CONST_FILENAME = "llama-2-7b-chat.ggmlv3.q4_1.bin"
+
+N_CTX = 2048
+
 llm = Llama(model_path=hf_hub_download(
-    repo_id="TheBloke/Llama-2-7B-Chat-GGML",
-    filename="llama-2-7b-chat.ggmlv3.q4_1.bin"), n_ctx=2048)  # download model from hf/ n_ctx=2048 for high ccontext length
-history = []
+    repo_id=CONST_REPO_ID,
+    filename=CONST_FILENAME),
+    n_ctx=2048
+)
+history = N_CTX
 
-pre_prompt = " The user and the AI are having a conversation : <|endoftext|> \n "
+
+pre_prompt = \
+    " The user and the AI are having a conversation : <|endoftext|> \n"
 
 
 def generate_text(input_text, history):
-    print("history ", history)
-    print("input ", input_text)
     temp = ""
     if history == []:
         input_text_with_history = f"SYSTEM:{pre_prompt}" + \
@@ -22,9 +29,10 @@ def generate_text(input_text, history):
     else:
         input_text_with_history = f"{history[-1][1]}" + "\n"
     input_text_with_history += f"USER: {input_text}" + "\n" + " ASSISTANT:"
-    print("new input", input_text_with_history)
     output = llm(input_text_with_history, max_tokens=1024, stop=[
-        "<|prompter|>", "<|endoftext|>", "<|endoftext|> \n", "ASSISTANT:", "USER:", "SYSTEM:"], stream=True)
+        "<|prompter|>", "<|endoftext|>", "<|endoftext|> \n",
+        "ASSISTANT:", "USER:", "SYSTEM:"], stream=True
+    )
     for out in output:
         stream = copy.deepcopy(out)
         print(stream["choices"][0]["text"])
@@ -35,13 +43,18 @@ def generate_text(input_text, history):
 
 
 demo = gr.ChatInterface(generate_text,
-                        title="LLM on CPU",
-                        description="Running LLM with https://github.com/abetlen/llama-cpp-python. btw the text streaming thing was the hardest thing to impliment",
-                        examples=["Hello", "Am I cool?",
-                                  "Are tomatoes vegetables?"],
+                        title=f"Lama2 on CPU: {CONST_FILENAME}",
+                        description=f"Running Llama2 with llama_cpp: \
+                            \r\n<i>{CONST_REPO_ID} {CONST_FILENAME}</i>",
+                        examples=["Hi!",
+                                  "Does it hard to be machine?",
+                                  "When i am need a doctor?",
+                                  "Ты говоришь по русски? Я злой."
+                                  ],
                         cache_examples=True,
-                        retry_btn=None,
-                        undo_btn="Delete Previous",
-                        clear_btn="Clear",)
-demo.queue(concurrency_count=1, max_size=5)
+                        retry_btn="Retry",
+                        undo_btn="Undo",
+                        clear_btn="Clear")
+
+demo.queue(concurrency_count=10, max_size=50)
 demo.launch()
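
For reference, a minimal self-contained sketch of what app.py does after this commit: download the GGML weights from the Hub with hf_hub_download, load them on CPU with llama-cpp-python, stream the completion back chunk by chunk, and wrap it in gr.ChatInterface. The diff does not show the top of the file or the body of the streaming loop, so the copy and gradio imports and the temp accumulation / yield lines below are assumptions filled in for illustration; the example prompts are abbreviated, and the ChatInterface button arguments and queue(concurrency_count=...) follow the Gradio 3.x-era API used in the commit.

import copy  # assumed import, not shown in the diff

import gradio as gr  # assumed import, not shown in the diff
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

CONST_REPO_ID = "TheBloke/Llama-2-7B-Chat-GGML"
CONST_FILENAME = "llama-2-7b-chat.ggmlv3.q4_1.bin"
N_CTX = 2048

# Download the GGML weights from the Hub (cached locally) and load them on CPU.
llm = Llama(
    model_path=hf_hub_download(repo_id=CONST_REPO_ID, filename=CONST_FILENAME),
    n_ctx=N_CTX)

pre_prompt = " The user and the AI are having a conversation : <|endoftext|> \n"


def generate_text(input_text, history):
    # history is Gradio's list of [user, assistant] message pairs.
    if history == []:
        input_text_with_history = f"SYSTEM:{pre_prompt}"
    else:
        input_text_with_history = f"{history[-1][1]}" + "\n"
    input_text_with_history += f"USER: {input_text}" + "\n" + " ASSISTANT:"

    output = llm(input_text_with_history, max_tokens=1024,
                 stop=["<|prompter|>", "<|endoftext|>", "<|endoftext|> \n",
                       "ASSISTANT:", "USER:", "SYSTEM:"],
                 stream=True)

    # Accumulate the streamed chunks and yield the running text so the chat UI
    # updates as tokens arrive (the yield loop is elided in the diff; this is
    # an assumed reconstruction).
    temp = ""
    for out in output:
        stream = copy.deepcopy(out)
        temp += stream["choices"][0]["text"]
        yield temp


demo = gr.ChatInterface(
    generate_text,
    title=f"Lama2 on CPU: {CONST_FILENAME}",
    description=f"Running Llama2 with llama_cpp: <i>{CONST_REPO_ID} {CONST_FILENAME}</i>",
    examples=["Hi!", "Is it hard to be a machine?"],
    cache_examples=True,
    retry_btn="Retry", undo_btn="Undo", clear_btn="Clear")

demo.queue(concurrency_count=10, max_size=50)
demo.launch()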