Petro committed on
Commit 036f518
Parent(s): b932f3e
Files changed (2)
  1. chat.py +44 -0
  2. main.py +14 -14
chat.py ADDED
@@ -0,0 +1,44 @@
+from llama_cpp import Llama
+from llama_cpp import ChatCompletionRequestMessage as Message
+from llama_cpp import ChatCompletionRequestSystemMessage as SystemMessage
+from llama_cpp import ChatCompletionRequestAssistantMessage as AssistantMessage
+from llama_cpp import ChatCompletionRequestUserMessage as UserMessage
+
+
+SYSTEM = 'system'
+USER = 'user'
+ASSISTANT = 'assistant'
+EXIT = 'exit'
+model_path = "zephyr-7b-beta.Q4_K_S.gguf"
+
+llm = Llama(model_path=model_path, n_ctx=512, max_answer_len=100)  # Set chat_format according to the model you are using
+
+
+class Chat:
+    def __init__(self, model: Llama) -> None:
+        self.model: Llama = model
+        self.messages: list[Message] = [
+            SystemMessage(
+                role=SYSTEM,
+                content='You are a helpful developer assistant, answer all the questions correctly and concisely.'
+            ),
+            AssistantMessage(role=ASSISTANT, content='Hello, do you have any question?'),
+        ]
+
+    def send_message(self, content: str):
+        new_message = UserMessage(role=USER, content=content)
+        self.messages.append(new_message)
+
+    def generate_reply(self) -> str:
+        response = self.model.create_chat_completion(
+            messages=self.messages,
+            temperature=0.7,
+            top_p=0.9,
+            top_k=20,
+            max_tokens=128
+        )
+
+        response_content = response['choices'][0]['message']['content']
+        self.messages.append(AssistantMessage(role=ASSISTANT, content=response_content))
+
+        return response_content
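Note: chat.py keeps the conversation state inside the Chat object and also defines an EXIT constant that nothing in this commit uses yet. A minimal console sketch of how the class could be exercised on its own is shown below; the file name chat_cli.py, the prompts, and the loop are illustrative assumptions, not part of the commit.

# chat_cli.py -- hypothetical usage sketch, not included in this commit
from chat import Chat, EXIT, llm

chat = Chat(model=llm)

while True:
    user_input = input('you> ')
    if user_input.strip().lower() == EXIT:      # typing 'exit' ends the session
        break
    chat.send_message(user_input)               # store the user turn
    print('assistant>', chat.generate_reply())  # reply is also appended to chat.messages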
main.py CHANGED
@@ -1,26 +1,26 @@
-from ctransformers import AutoModelForCausalLM
 from fastapi import FastAPI
+from llama_cpp import Llama
 from pydantic import BaseModel
 
-file_name = "zephyr-7b-beta.Q4_K_S.gguf"
-llm = AutoModelForCausalLM.from_pretrained(file_name,
-                                           model_type='mistral',
-                                           max_new_tokens=2096,
-                                           threads=8000,
-                                           )
+from chat import Chat
+
+model_path = "zephyr-7b-beta.Q4_K_S.gguf"
+
+llm = Llama(model_path=model_path, n_ctx=512, max_answer_len=100)  # Set chat_format according to the model you are using
+
+
 
-#Pydantic object
 class validation(BaseModel):
     prompt: str
-#Fast API
+
 
 app = FastAPI()
+chat = Chat(model=llm)
+
 
 @app.post("/llm_on_cpu")
 async def stream(item: validation):
-    system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.'
-    E_INST = "</s>"
-    user, assistant = "<|user|>", "<|assistant|>"
-    prompt = f"{system_prompt}{E_INST}\n{user}\n{item.prompt}{E_INST}\n{assistant}\n"
+    chat.send_message(item.prompt)
+    response = chat.generate_reply()
 
-    return llm(prompt)
+    return response
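With this change every request to /llm_on_cpu goes through the shared Chat instance, and the endpoint returns the assistant's reply for the running conversation. A client-side sketch follows; the host, port, and use of the requests library are assumptions (for example, after starting the app with uvicorn main:app).

# client.py -- hypothetical request against the endpoint defined in main.py
import requests

resp = requests.post(
    "http://localhost:8000/llm_on_cpu",                        # assumes uvicorn's default port
    json={"prompt": "What does n_ctx control in llama.cpp?"},  # matches the validation model's 'prompt' field
)
print(resp.json())  # the reply string produced by Chat.generate_reply()

Because chat is a single module-level object, all clients extend the same conversation history, and with n_ctx=512 the context window will fill up after a few exchanges.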