Spaces:
Sleeping
Sleeping
Tuchuanhuhuhu
committed on
Commit
·
6a2dc28
1
Parent(s):
d708c00
ChatGLM可以用了
Browse files
- ChuanhuChatbot.py +2 -2
- modules/base_model.py +18 -6
- modules/models.py +136 -21
- modules/presets.py +3 -0
- requirements.txt +4 -0
ChuanhuChatbot.py
CHANGED
@@ -22,7 +22,7 @@ with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
|
|
22 |
user_name = gr.State("")
|
23 |
promptTemplates = gr.State(load_template(get_template_names(plain=True)[0], mode=2))
|
24 |
user_question = gr.State("")
|
25 |
-
current_model = gr.State(get_model(MODELS[
|
26 |
|
27 |
topic = gr.State("未命名对话历史记录")
|
28 |
|
@@ -78,7 +78,7 @@ with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
|
|
78 |
else:
|
79 |
usageTxt = gr.Markdown("**发送消息** 或 **提交key** 以显示额度", elem_id="usage_display")
|
80 |
model_select_dropdown = gr.Dropdown(
|
81 |
-
label="选择模型", choices=MODELS, multiselect=False, value=MODELS[
|
82 |
)
|
83 |
use_streaming_checkbox = gr.Checkbox(
|
84 |
label="实时传输回答", value=True, visible=ENABLE_STREAMING_OPTION
|
|
|
22 |
user_name = gr.State("")
|
23 |
promptTemplates = gr.State(load_template(get_template_names(plain=True)[0], mode=2))
|
24 |
user_question = gr.State("")
|
25 |
+
current_model = gr.State(get_model(MODELS[DEFAULT_MODEL], my_api_key)[0])
|
26 |
|
27 |
topic = gr.State("未命名对话历史记录")
|
28 |
|
|
|
78 |
else:
|
79 |
usageTxt = gr.Markdown("**发送消息** 或 **提交key** 以显示额度", elem_id="usage_display")
|
80 |
model_select_dropdown = gr.Dropdown(
|
81 |
+
label="选择模型", choices=MODELS, multiselect=False, value=MODELS[DEFAULT_MODEL], interactive=True
|
82 |
)
|
83 |
use_streaming_checkbox = gr.Checkbox(
|
84 |
label="实时传输回答", value=True, visible=ENABLE_STREAMING_OPTION
|
modules/base_model.py
CHANGED
@@ -33,7 +33,7 @@ class ModelType(Enum):
|
|
33 |
model_type = None
|
34 |
if "gpt" in model_name.lower():
|
35 |
model_type = ModelType.OpenAI
|
36 |
-
elif "chatglm" in model_name.
|
37 |
model_type = ModelType.ChatGLM
|
38 |
else:
|
39 |
model_type = ModelType.LLaMA
|
@@ -59,7 +59,10 @@ class BaseLLMModel:
|
|
59 |
self.all_token_counts = []
|
60 |
self.model_name = model_name
|
61 |
self.model_type = ModelType.get_type(model_name)
|
62 |
-
|
|
|
|
|
|
|
63 |
self.interrupted = False
|
64 |
self.system_prompt = system_prompt
|
65 |
self.api_key = None
|
@@ -79,7 +82,9 @@ class BaseLLMModel:
|
|
79 |
conversations are stored in self.history, with the most recent question, in OpenAI format
|
80 |
should return a generator, each time give the next word (str) in the answer
|
81 |
"""
|
82 |
-
|
|
|
|
|
83 |
|
84 |
def get_answer_at_once(self):
|
85 |
"""predict at once, need to be implemented
|
@@ -88,15 +93,22 @@ class BaseLLMModel:
|
|
88 |
the answer (str)
|
89 |
total token count (int)
|
90 |
"""
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
def billing_info(self):
|
94 |
"""get billing information, implement if needed"""
|
|
|
95 |
return BILLING_NOT_APPLICABLE_MSG
|
96 |
|
97 |
def count_token(self, user_input):
|
98 |
"""get token count from input, implement if needed"""
|
99 |
-
|
|
|
100 |
|
101 |
def stream_next_chatbot(self, inputs, chatbot, fake_input=None, display_append=""):
|
102 |
def get_return_value():
|
@@ -234,7 +246,7 @@ class BaseLLMModel:
|
|
234 |
else:
|
235 |
display_reference = ""
|
236 |
|
237 |
-
if len(self.api_key) == 0 and not shared.state.multi_api_key:
|
238 |
status_text = STANDARD_ERROR_MSG + NO_APIKEY_MSG
|
239 |
logging.info(status_text)
|
240 |
chatbot.append((inputs, ""))
|
|
|
33 |
model_type = None
|
34 |
if "gpt" in model_name.lower():
|
35 |
model_type = ModelType.OpenAI
|
36 |
+
elif "chatglm" in model_name.lower():
|
37 |
model_type = ModelType.ChatGLM
|
38 |
else:
|
39 |
model_type = ModelType.LLaMA
|
|
|
59 |
self.all_token_counts = []
|
60 |
self.model_name = model_name
|
61 |
self.model_type = ModelType.get_type(model_name)
|
62 |
+
try:
|
63 |
+
self.token_upper_limit = MODEL_TOKEN_LIMIT[model_name]
|
64 |
+
except KeyError:
|
65 |
+
self.token_upper_limit = DEFAULT_TOKEN_LIMIT
|
66 |
self.interrupted = False
|
67 |
self.system_prompt = system_prompt
|
68 |
self.api_key = None
|
|
|
82 |
conversations are stored in self.history, with the most recent question, in OpenAI format
|
83 |
should return a generator, each time give the next word (str) in the answer
|
84 |
"""
|
85 |
+
logging.warning("stream predict not implemented, using at once predict instead")
|
86 |
+
response, _ = self.get_answer_at_once()
|
87 |
+
yield response
|
88 |
|
89 |
def get_answer_at_once(self):
|
90 |
"""predict at once, need to be implemented
|
|
|
93 |
the answer (str)
|
94 |
total token count (int)
|
95 |
"""
|
96 |
+
logging.warning("at once predict not implemented, using stream predict instead")
|
97 |
+
response_iter = self.get_answer_stream_iter()
|
98 |
+
count = 0
|
99 |
+
for response in response_iter:
|
100 |
+
count += 1
|
101 |
+
return response, sum(self.all_token_counts) + count
|
102 |
|
103 |
def billing_info(self):
|
104 |
"""get billing information, implement if needed"""
|
105 |
+
logging.warning("billing info not implemented, using default")
|
106 |
return BILLING_NOT_APPLICABLE_MSG
|
107 |
|
108 |
def count_token(self, user_input):
|
109 |
"""get token count from input, implement if needed"""
|
110 |
+
logging.warning("token count not implemented, using default")
|
111 |
+
return len(user_input)
|
112 |
|
113 |
def stream_next_chatbot(self, inputs, chatbot, fake_input=None, display_append=""):
|
114 |
def get_return_value():
|
|
|
246 |
else:
|
247 |
display_reference = ""
|
248 |
|
249 |
+
if self.api_key is not None and len(self.api_key) == 0 and not shared.state.multi_api_key:
|
250 |
status_text = STANDARD_ERROR_MSG + NO_APIKEY_MSG
|
251 |
logging.info(status_text)
|
252 |
chatbot.append((inputs, ""))
|
modules/models.py
CHANGED
@@ -10,6 +10,14 @@ import requests
|
|
10 |
import urllib3
|
11 |
import platform
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
from tqdm import tqdm
|
14 |
import colorama
|
15 |
from duckduckgo_search import ddg
|
@@ -213,27 +221,39 @@ class ChatGLM_Client(BaseLLMModel):
|
|
213 |
else:
|
214 |
model_source = f"THUDM/{model_name}"
|
215 |
self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
if torch.cuda.is_available():
|
217 |
# run on CUDA
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
|
|
|
|
|
|
222 |
else:
|
223 |
-
|
224 |
-
model = AutoModel.from_pretrained(model_source, trust_remote_code=True).float()
|
225 |
model = model.eval()
|
226 |
self.model = model
|
227 |
|
228 |
def _get_glm_style_input(self):
|
229 |
history = [x["content"] for x in self.history]
|
230 |
query = history.pop()
|
|
|
|
|
|
|
231 |
return history, query
|
232 |
|
233 |
def get_answer_at_once(self):
|
234 |
history, query = self._get_glm_style_input()
|
235 |
response, _ = self.model.chat(self.tokenizer, query, history=history)
|
236 |
-
return response
|
237 |
|
238 |
def get_answer_stream_iter(self):
|
239 |
history, query = self._get_glm_style_input()
|
@@ -241,6 +261,100 @@ class ChatGLM_Client(BaseLLMModel):
|
|
241 |
temperature=self.temperature):
|
242 |
yield response
|
243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
244 |
|
245 |
def get_model(
|
246 |
model_name, access_key=None, temperature=None, top_p=None, system_prompt=None
|
@@ -248,7 +362,7 @@ def get_model(
|
|
248 |
msg = f"模型设置为了: {model_name}"
|
249 |
logging.info(msg)
|
250 |
model_type = ModelType.get_type(model_name)
|
251 |
-
|
252 |
if model_type == ModelType.OpenAI:
|
253 |
model = OpenAIClient(
|
254 |
model_name=model_name,
|
@@ -265,29 +379,30 @@ def get_model(
|
|
265 |
if __name__ == "__main__":
|
266 |
with open("config.json", "r") as f:
|
267 |
openai_api_key = cjson.load(f)["openai_api_key"]
|
268 |
-
client =
|
|
|
269 |
chatbot = []
|
270 |
-
stream =
|
271 |
# 测试账单功能
|
272 |
-
|
273 |
-
|
274 |
# 测试问答
|
275 |
-
|
276 |
question = "巴黎是中国的首都吗?"
|
277 |
for i in client.predict(inputs=question, chatbot=chatbot, stream=stream):
|
278 |
-
|
279 |
-
|
280 |
# 测试记忆力
|
281 |
-
|
282 |
question = "我刚刚问了你什么问题?"
|
283 |
for i in client.predict(inputs=question, chatbot=chatbot, stream=stream):
|
284 |
-
|
285 |
-
|
286 |
# 测试重试功能
|
287 |
-
|
288 |
for i in client.retry(chatbot=chatbot, stream=stream):
|
289 |
-
|
290 |
-
|
291 |
# # 测试总结功能
|
292 |
# print(colorama.Back.GREEN + "测试总结功能" + colorama.Back.RESET)
|
293 |
# chatbot, msg = client.reduce_token_size(chatbot=chatbot)
|
|
|
10 |
import urllib3
|
11 |
import platform
|
12 |
|
13 |
+
from dataclasses import dataclass, field
|
14 |
+
from transformers import HfArgumentParser
|
15 |
+
|
16 |
+
from lmflow.datasets.dataset import Dataset
|
17 |
+
from lmflow.pipeline.auto_pipeline import AutoPipeline
|
18 |
+
from lmflow.models.auto_model import AutoModel
|
19 |
+
from lmflow.args import ModelArguments, DatasetArguments, AutoArguments
|
20 |
+
|
21 |
from tqdm import tqdm
|
22 |
import colorama
|
23 |
from duckduckgo_search import ddg
|
|
|
221 |
else:
|
222 |
model_source = f"THUDM/{model_name}"
|
223 |
self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
|
224 |
+
quantified = False
|
225 |
+
if "int4" in model_name:
|
226 |
+
quantified = True
|
227 |
+
if quantified:
|
228 |
+
model = AutoModel.from_pretrained(model_source, trust_remote_code=True).float()
|
229 |
+
else:
|
230 |
+
model = AutoModel.from_pretrained(model_source, trust_remote_code=True).half()
|
231 |
if torch.cuda.is_available():
|
232 |
# run on CUDA
|
233 |
+
logging.info("CUDA is available, using CUDA")
|
234 |
+
model = model.cuda()
|
235 |
+
# mps加速还存在一些问题,暂时不使用
|
236 |
+
# elif system_name == "Darwin" and model_path is not None:
|
237 |
+
# logging.info("Running on macOS, using MPS")
|
238 |
+
# # running on macOS and model already downloaded
|
239 |
+
# model = model.to('mps')
|
240 |
else:
|
241 |
+
logging.info("GPU is not available, using CPU")
|
|
|
242 |
model = model.eval()
|
243 |
self.model = model
|
244 |
|
245 |
def _get_glm_style_input(self):
|
246 |
history = [x["content"] for x in self.history]
|
247 |
query = history.pop()
|
248 |
+
logging.info(colorama.Fore.YELLOW + f"{history}" + colorama.Fore.RESET)
|
249 |
+
assert len(history) % 2 == 0
|
250 |
+
history = [[history[i], history[i+1]] for i in range(0, len(history), 2)]
|
251 |
return history, query
|
252 |
|
253 |
def get_answer_at_once(self):
|
254 |
history, query = self._get_glm_style_input()
|
255 |
response, _ = self.model.chat(self.tokenizer, query, history=history)
|
256 |
+
return response, len(response)
|
257 |
|
258 |
def get_answer_stream_iter(self):
|
259 |
history, query = self._get_glm_style_input()
|
|
|
261 |
temperature=self.temperature):
|
262 |
yield response
|
263 |
|
264 |
+
@dataclass
|
265 |
+
class ChatbotArguments:
|
266 |
+
pass
|
267 |
+
|
268 |
+
class LLaMA_Client(BaseLLMModel):
|
269 |
+
def __init__(
|
270 |
+
self,
|
271 |
+
model_name,
|
272 |
+
lora_path = None,
|
273 |
+
) -> None:
|
274 |
+
super().__init__(
|
275 |
+
model_name=model_name
|
276 |
+
)
|
277 |
+
self.max_generation_token = 1000
|
278 |
+
pipeline_name = "inferencer"
|
279 |
+
PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
|
280 |
+
|
281 |
+
parser = HfArgumentParser((
|
282 |
+
ModelArguments,
|
283 |
+
PipelineArguments,
|
284 |
+
ChatbotArguments,
|
285 |
+
))
|
286 |
+
model_args, pipeline_args, chatbot_args = (
|
287 |
+
parser.parse_args_into_dataclasses()
|
288 |
+
)
|
289 |
+
|
290 |
+
with open (pipeline_args.deepspeed, "r") as f:
|
291 |
+
ds_config = json.load(f)
|
292 |
+
|
293 |
+
self.model = AutoModel.get_model(
|
294 |
+
model_args,
|
295 |
+
tune_strategy='none',
|
296 |
+
ds_config=ds_config,
|
297 |
+
)
|
298 |
+
|
299 |
+
# We don't need input data, we will read interactively from stdin
|
300 |
+
data_args = DatasetArguments(dataset_path=None)
|
301 |
+
self.dataset = Dataset(data_args)
|
302 |
+
|
303 |
+
self.inferencer = AutoPipeline.get_pipeline(
|
304 |
+
pipeline_name=pipeline_name,
|
305 |
+
model_args=model_args,
|
306 |
+
data_args=data_args,
|
307 |
+
pipeline_args=pipeline_args,
|
308 |
+
)
|
309 |
+
|
310 |
+
# Chats
|
311 |
+
model_name = model_args.model_name_or_path
|
312 |
+
if model_args.lora_model_path is not None:
|
313 |
+
model_name += f" + {model_args.lora_model_path}"
|
314 |
+
|
315 |
+
# context = (
|
316 |
+
# "You are a helpful assistant who follows the given instructions"
|
317 |
+
# " unconditionally."
|
318 |
+
# )
|
319 |
+
self.end_string = "\n\n"
|
320 |
+
|
321 |
+
def _get_llama_style_input(self):
|
322 |
+
history = [x["content"] for x in self.history]
|
323 |
+
context = "\n".join(history)
|
324 |
+
return context
|
325 |
+
|
326 |
+
|
327 |
+
def get_answer_at_once(self):
|
328 |
+
context = self._get_llama_style_input()
|
329 |
+
|
330 |
+
input_dataset = self.dataset.from_dict({
|
331 |
+
"type": "text_only",
|
332 |
+
"instances": [ { "text": context } ]
|
333 |
+
})
|
334 |
+
|
335 |
+
output_dataset = self.inferencer.inference(
|
336 |
+
model=self.model,
|
337 |
+
dataset=input_dataset,
|
338 |
+
max_new_tokens=self.max_generation_token,
|
339 |
+
temperature=self.temperature,
|
340 |
+
)
|
341 |
+
|
342 |
+
response = output_dataset.to_dict()["instances"][0]["text"]
|
343 |
+
|
344 |
+
try:
|
345 |
+
index = response.index(self.end_string)
|
346 |
+
except ValueError:
|
347 |
+
response += self.end_string
|
348 |
+
index = response.index(self.end_string)
|
349 |
+
|
350 |
+
response = response[:index + 1]
|
351 |
+
return response, len(response)
|
352 |
+
|
353 |
+
def get_answer_stream_iter(self):
|
354 |
+
response, _ = self.get_answer_at_once()
|
355 |
+
yield response
|
356 |
+
|
357 |
+
|
358 |
|
359 |
def get_model(
|
360 |
model_name, access_key=None, temperature=None, top_p=None, system_prompt=None
|
|
|
362 |
msg = f"模型设置为了: {model_name}"
|
363 |
logging.info(msg)
|
364 |
model_type = ModelType.get_type(model_name)
|
365 |
+
print(model_type.name)
|
366 |
if model_type == ModelType.OpenAI:
|
367 |
model = OpenAIClient(
|
368 |
model_name=model_name,
|
|
|
379 |
if __name__ == "__main__":
|
380 |
with open("config.json", "r") as f:
|
381 |
openai_api_key = cjson.load(f)["openai_api_key"]
|
382 |
+
# client, _ = get_model("gpt-3.5-turbo", openai_api_key)
|
383 |
+
client, _ = get_model("chatglm-6b-int4")
|
384 |
chatbot = []
|
385 |
+
stream = True
|
386 |
# 测试账单功能
|
387 |
+
logging.info(colorama.Back.GREEN + "测试账单功能" + colorama.Back.RESET)
|
388 |
+
logging.info(client.billing_info())
|
389 |
# 测试问答
|
390 |
+
logging.info(colorama.Back.GREEN + "测试问答" + colorama.Back.RESET)
|
391 |
question = "巴黎是中国的首都吗?"
|
392 |
for i in client.predict(inputs=question, chatbot=chatbot, stream=stream):
|
393 |
+
logging.info(i)
|
394 |
+
logging.info(f"测试问答后history : {client.history}")
|
395 |
# 测试记忆力
|
396 |
+
logging.info(colorama.Back.GREEN + "测试记忆力" + colorama.Back.RESET)
|
397 |
question = "我刚刚问了你什么问题?"
|
398 |
for i in client.predict(inputs=question, chatbot=chatbot, stream=stream):
|
399 |
+
logging.info(i)
|
400 |
+
logging.info(f"测试记忆力后history : {client.history}")
|
401 |
# 测试重试功能
|
402 |
+
logging.info(colorama.Back.GREEN + "测试重试功能" + colorama.Back.RESET)
|
403 |
for i in client.retry(chatbot=chatbot, stream=stream):
|
404 |
+
logging.info(i)
|
405 |
+
logging.info(f"重试后history : {client.history}")
|
406 |
# # 测试总结功能
|
407 |
# print(colorama.Back.GREEN + "测试总结功能" + colorama.Back.RESET)
|
408 |
# chatbot, msg = client.reduce_token_size(chatbot=chatbot)
|
modules/presets.py
CHANGED
@@ -62,6 +62,8 @@ MODELS = [
|
|
62 |
"chatglm-6b-int4-qe"
|
63 |
] # 可选的模型
|
64 |
|
|
|
|
|
65 |
MODEL_TOKEN_LIMIT = {
|
66 |
"gpt-3.5-turbo": 4096,
|
67 |
"gpt-3.5-turbo-0301": 4096,
|
@@ -72,6 +74,7 @@ MODEL_TOKEN_LIMIT = {
|
|
72 |
}
|
73 |
|
74 |
TOKEN_OFFSET = 1000 # 模型的token上限减去这个值,得到软上限。到达软上限之后,自动尝试减少token占用。
|
|
|
75 |
REDUCE_TOKEN_FACTOR = 0.5 # 与模型token上限相乘,得到目标token数。减少token占用时,将token占用减少到目标token数以下。
|
76 |
|
77 |
REPLY_LANGUAGES = [
|
|
|
62 |
"chatglm-6b-int4-qe"
|
63 |
] # 可选的模型
|
64 |
|
65 |
+
DEFAULT_MODEL = 0 # 默认的模型在MODELS中的序号,从0开始数
|
66 |
+
|
67 |
MODEL_TOKEN_LIMIT = {
|
68 |
"gpt-3.5-turbo": 4096,
|
69 |
"gpt-3.5-turbo-0301": 4096,
|
|
|
74 |
}
|
75 |
|
76 |
TOKEN_OFFSET = 1000 # 模型的token上限减去这个值,得到软上限。到达软上限之后,自动尝试减少token占用。
|
77 |
+
DEFAULT_TOKEN_LIMIT = 3000 # 默认的token上限
|
78 |
REDUCE_TOKEN_FACTOR = 0.5 # 与模型token上限相乘,得到目标token数。减少token占用时,将token占用减少到目标token数以下。
|
79 |
|
80 |
REPLY_LANGUAGES = [
|
requirements.txt
CHANGED
@@ -15,3 +15,7 @@ pdfplumber
|
|
15 |
pandas
|
16 |
transformers
|
17 |
torch
|
|
|
|
|
|
|
|
|
|
15 |
pandas
|
16 |
transformers
|
17 |
torch
|
18 |
+
mpi4py
|
19 |
+
icetk
|
20 |
+
git+https://github.com/OptimalScale/LMFlow.git
|
21 |
+
cpm-kernels
|