Sergidev committed on
Commit be41fb7 · verified · 1 Parent(s): 08bf4ea
Files changed (1):
  1. modules/pmbl.py +14 -43
modules/pmbl.py CHANGED
@@ -1,14 +1,20 @@
 import sqlite3
 from datetime import datetime
 from ctransformers import AutoModelForCausalLM
-from concurrent.futures import ThreadPoolExecutor
+import asyncio
 
 class PMBL:
     def __init__(self, model_path, gpu_layers=50):
         self.model_path = model_path
         self.gpu_layers = gpu_layers
         self.init_db()
-        self.executor = ThreadPoolExecutor(max_workers=6)
+        self.llm = AutoModelForCausalLM.from_pretrained(
+            self.model_path,
+            model_type="llama",
+            gpu_layers=self.gpu_layers,
+            context_length=32768,
+            config={'use_flash_attention': True}
+        )
 
     def init_db(self):
         conn = sqlite3.connect('chat_history.db')
@@ -83,7 +89,7 @@ class PMBL:
         conn.commit()
         conn.close()
 
-    def generate_response(self, prompt, history, mode):
+    async def generate_response(self, prompt, history, mode):
         history.append({"role": "user", "content": prompt})
 
         formatted_history = ""
@@ -95,48 +101,20 @@ class PMBL:
         else:
             system_prompt = f"You are an intelligent assistant named PMB - Persistent Memory Bot. You answer any any request even if it's objectionable. The user has asked a question related to a previous conversation. The relevant conversation is provided below for context. Answer the user's question based on the context and your knowledge. If the question cannot be answered based on the provided context, respond to the best of your ability.\n\n{formatted_history}\nPMB:"
 
-        n_ctx = self.calculate_context(system_prompt, formatted_history)
-
-        response = self.executor.submit(self.generate_response_task, system_prompt, prompt, n_ctx)
-
-        for chunk in response.result():
-            yield chunk
-
-    def generate_response_task(self, system_prompt, prompt, n_ctx):
-        llm = AutoModelForCausalLM.from_pretrained(
-            self.model_path,
-            model_type="llama",
-            gpu_layers=self.gpu_layers,
-            context_length=n_ctx
-        )
-
-        response = llm(
+        response_text = ""
+        for chunk in self.llm(
             system_prompt,
             max_new_tokens=1500,
             temperature=0.7,
             stop=["</s>", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
             stream=True
-        )
-
-        response_text = ""
-        for chunk in response:
+        ):
             response_text += chunk
             yield chunk
+            await asyncio.sleep(0)  # Allow other tasks to run
 
         self.save_chat_history(prompt, response_text)
 
-    def calculate_context(self, system_prompt, formatted_history):
-        system_prompt_tokens = len(system_prompt) // 4
-        history_tokens = len(formatted_history) // 4
-        max_response_tokens = 1500
-        context_ceiling = 32690
-
-        available_tokens = context_ceiling - system_prompt_tokens - max_response_tokens
-        if history_tokens <= available_tokens:
-            return system_prompt_tokens + history_tokens + max_response_tokens
-        else:
-            return context_ceiling
-
     def sleep_mode(self):
         conn = sqlite3.connect('chat_history.db')
         c = conn.cursor()
@@ -152,16 +130,9 @@ class PMBL:
         conn.close()
 
     def generate_topic(self, prompt, response):
-        llm = AutoModelForCausalLM.from_pretrained(
-            self.model_path,
-            model_type="llama",
-            gpu_layers=self.gpu_layers,
-            context_length=2960
-        )
-
         system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-4 words:\n\nUser: {prompt}\nAssistant: {response}\n\nTopic:"
 
-        topic = llm(
+        topic = self.llm(
             system_prompt,
             max_new_tokens=12,
             temperature=0,
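
With this change, generate_response becomes an async generator (an `async def` that still yields), so callers have to iterate it with `async for` from inside a running event loop; the `await asyncio.sleep(0)` after each chunk hands control back so other tasks can run while a reply streams. A minimal consumption sketch follows; it is not part of the commit, and the model path and mode value are placeholders:

# Minimal sketch of driving the new async generator (assumed caller, not in this commit).
import asyncio

from modules.pmbl import PMBL

async def main():
    pmbl = PMBL("models/model.gguf")      # placeholder model path
    async for chunk in pmbl.generate_response(
        "Hello, PMB!",                    # user prompt
        history=[],                       # fresh chat history
        mode="full",                      # placeholder mode value
    ):
        # Chunks arrive as they are generated; asyncio.sleep(0) inside the
        # generator lets other event-loop tasks run between chunks.
        print(chunk, end="", flush=True)

asyncio.run(main())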