xsa-dev committed on
Commit
55c0bb6
·
1 Parent(s): 6bf19a4
Files changed (1) hide show
  1. app.py +49 -20
app.py CHANGED
@@ -1,21 +1,50 @@
1
  import gradio as gr
2
- import os
3
-
4
- title = "LLAMA2"
5
- description = "LLAMA2"
6
- article = "<p style='text-align: center'>test ...</p>"
7
- examples = [
8
- ["Test message 1"],
9
- ["What you can?"],
10
- ["Гладкая Бореальная низина на северном полушарии занимает 40%"],
11
- ]
12
- gr.load(
13
- "huggingface/meta-llama/Llama-2-7b-chat-hf",
14
- inputs=gr.Textbox(lines=5, label="Входной текст"),
15
- title=title,
16
- description=description,
17
- article=article,
18
- examples=examples,
19
- enable_queue=True,
20
- api_key=os.getenv('api_key')
21
- ).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import copy
3
+ import time
4
+ import ctypes # to call the C API directly
5
+ import llama_cpp
6
+ from llama_cpp import Llama
7
+ from huggingface_hub import hf_hub_download # download files from the Hugging Face Hub
8
+
9
+
10
+ llm = Llama(model_path=hf_hub_download(
11
+ repo_id="TheBloke/Llama-2-7B-Chat-GGML",
12
+ filename="llama-2-7b-chat.ggmlv3.q4_1.bin"), n_ctx=2048) # download the model from the Hugging Face Hub; n_ctx=2048 for a longer context length
13
+ history = []
14
+
15
+ pre_prompt = " The user and the AI are having a conversation : <|endoftext|> \n "
16
+
17
+
18
+ def generate_text(input_text, history):
19
+ print("history ", history)
20
+ print("input ", input_text)
21
+ temp = ""
22
+ if history == []:
23
+ input_text_with_history = f"SYSTEM:{pre_prompt}" + \
24
+ "\n" + f"USER: {input_text} " + "\n" + " ASSISTANT:"
25
+ else:
26
+ input_text_with_history = f"{history[-1][1]}" + "\n"
27
+ input_text_with_history += f"USER: {input_text}" + "\n" + " ASSISTANT:"
28
+ print("new input", input_text_with_history)
29
+ output = llm(input_text_with_history, max_tokens=1024, stop=[
30
+ "<|prompter|>", "<|endoftext|>", "<|endoftext|> \n", "ASSISTANT:", "USER:", "SYSTEM:"], stream=True)
31
+ for out in output:
32
+ stream = copy.deepcopy(out)
33
+ print(stream["choices"][0]["text"])
34
+ temp += stream["choices"][0]["text"]
35
+ yield temp
36
+
37
+ history = ["init", input_text_with_history]
38
+
39
+
40
+ demo = gr.ChatInterface(generate_text,
41
+ title="LLM on CPU",
42
+ description="Running LLM with https://github.com/abetlen/llama-cpp-python. btw the text streaming thing was the hardest thing to impliment",
43
+ examples=["Hello", "Am I cool?",
44
+ "Are tomatoes vegetables?"],
45
+ cache_examples=True,
46
+ retry_btn=None,
47
+ undo_btn="Delete Previous",
48
+ clear_btn="Clear",)
49
+ demo.queue(concurrency_count=1, max_size=5)
50
+ demo.launch()