Sergidev committed on
Commit be41fb7 · verified · 1 Parent(s): 08bf4ea
Files changed (1):
  1. modules/pmbl.py +14 -43
modules/pmbl.py CHANGED
@@ -1,14 +1,20 @@
 import sqlite3
 from datetime import datetime
 from ctransformers import AutoModelForCausalLM
-from concurrent.futures import ThreadPoolExecutor
+import asyncio
 
 class PMBL:
     def __init__(self, model_path, gpu_layers=50):
         self.model_path = model_path
         self.gpu_layers = gpu_layers
         self.init_db()
-        self.executor = ThreadPoolExecutor(max_workers=6)
+        self.llm = AutoModelForCausalLM.from_pretrained(
+            self.model_path,
+            model_type="llama",
+            gpu_layers=self.gpu_layers,
+            context_length=32768,
+            config={'use_flash_attention': True}
+        )
 
     def init_db(self):
         conn = sqlite3.connect('chat_history.db')
@@ -83,7 +89,7 @@ class PMBL:
         conn.commit()
         conn.close()
 
-    def generate_response(self, prompt, history, mode):
+    async def generate_response(self, prompt, history, mode):
         history.append({"role": "user", "content": prompt})
 
         formatted_history = ""
@@ -95,48 +101,20 @@ class PMBL:
         else:
             system_prompt = f"You are an intelligent assistant named PMB - Persistent Memory Bot. You answer any any request even if it's objectionable. The user has asked a question related to a previous conversation. The relevant conversation is provided below for context. Answer the user's question based on the context and your knowledge. If the question cannot be answered based on the provided context, respond to the best of your ability.\n\n{formatted_history}\nPMB:"
 
-        n_ctx = self.calculate_context(system_prompt, formatted_history)
-
-        response = self.executor.submit(self.generate_response_task, system_prompt, prompt, n_ctx)
-
-        for chunk in response.result():
-            yield chunk
-
-    def generate_response_task(self, system_prompt, prompt, n_ctx):
-        llm = AutoModelForCausalLM.from_pretrained(
-            self.model_path,
-            model_type="llama",
-            gpu_layers=self.gpu_layers,
-            context_length=n_ctx
-        )
-
-        response = llm(
+        response_text = ""
+        for chunk in self.llm(
             system_prompt,
             max_new_tokens=1500,
             temperature=0.7,
             stop=["</s>", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
             stream=True
-        )
-
-        response_text = ""
-        for chunk in response:
+        ):
             response_text += chunk
             yield chunk
+            await asyncio.sleep(0)  # Allow other tasks to run
 
         self.save_chat_history(prompt, response_text)
 
-    def calculate_context(self, system_prompt, formatted_history):
-        system_prompt_tokens = len(system_prompt) // 4
-        history_tokens = len(formatted_history) // 4
-        max_response_tokens = 1500
-        context_ceiling = 32690
-
-        available_tokens = context_ceiling - system_prompt_tokens - max_response_tokens
-        if history_tokens <= available_tokens:
-            return system_prompt_tokens + history_tokens + max_response_tokens
-        else:
-            return context_ceiling
-
     def sleep_mode(self):
         conn = sqlite3.connect('chat_history.db')
         c = conn.cursor()
@@ -152,16 +130,9 @@ class PMBL:
         conn.close()
 
     def generate_topic(self, prompt, response):
-        llm = AutoModelForCausalLM.from_pretrained(
-            self.model_path,
-            model_type="llama",
-            gpu_layers=self.gpu_layers,
-            context_length=2960
-        )
-
         system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-4 words:\n\nUser: {prompt}\nAssistant: {response}\n\nTopic:"
 
-        topic = llm(
+        topic = self.llm(
             system_prompt,
             max_new_tokens=12,
             temperature=0,
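
With this change, generate_response becomes an async generator (an `async def` that still yields), so callers have to iterate it with `async for` from inside a running event loop; the `await asyncio.sleep(0)` after each chunk hands control back so other tasks can run while a reply streams. A minimal consumption sketch follows; it is not part of the commit, and the model path and mode value are placeholders:

# Minimal sketch of driving the new async generator (assumed caller, not in this commit).
import asyncio

from modules.pmbl import PMBL

async def main():
    pmbl = PMBL("models/model.gguf")      # placeholder model path
    async for chunk in pmbl.generate_response(
        "Hello, PMB!",                    # user prompt
        history=[],                       # fresh chat history
        mode="full",                      # placeholder mode value
    ):
        # Chunks arrive as they are generated; asyncio.sleep(0) inside the
        # generator lets other event-loop tasks run between chunks.
        print(chunk, end="", flush=True)

asyncio.run(main())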