Tuchuanhuhuhu committed on
Commit
6a2dc28
·
1 Parent(s): d708c00

ChatGLM可以用了

Browse files
ChuanhuChatbot.py CHANGED
@@ -22,7 +22,7 @@ with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
22
  user_name = gr.State("")
23
  promptTemplates = gr.State(load_template(get_template_names(plain=True)[0], mode=2))
24
  user_question = gr.State("")
25
- current_model = gr.State(get_model(MODELS[0], my_api_key)[0])
26
 
27
  topic = gr.State("未命名对话历史记录")
28
 
@@ -78,7 +78,7 @@ with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
78
  else:
79
  usageTxt = gr.Markdown("**发送消息** 或 **提交key** 以显示额度", elem_id="usage_display")
80
  model_select_dropdown = gr.Dropdown(
81
- label="选择模型", choices=MODELS, multiselect=False, value=MODELS[0], interactive=True
82
  )
83
  use_streaming_checkbox = gr.Checkbox(
84
  label="实时传输回答", value=True, visible=ENABLE_STREAMING_OPTION
 
22
  user_name = gr.State("")
23
  promptTemplates = gr.State(load_template(get_template_names(plain=True)[0], mode=2))
24
  user_question = gr.State("")
25
+ current_model = gr.State(get_model(MODELS[DEFAULT_MODEL], my_api_key)[0])
26
 
27
  topic = gr.State("未命名对话历史记录")
28
 
 
78
  else:
79
  usageTxt = gr.Markdown("**发送消息** 或 **提交key** 以显示额度", elem_id="usage_display")
80
  model_select_dropdown = gr.Dropdown(
81
+ label="选择模型", choices=MODELS, multiselect=False, value=MODELS[DEFAULT_MODEL], interactive=True
82
  )
83
  use_streaming_checkbox = gr.Checkbox(
84
  label="实时传输回答", value=True, visible=ENABLE_STREAMING_OPTION
modules/base_model.py CHANGED
@@ -33,7 +33,7 @@ class ModelType(Enum):
33
  model_type = None
34
  if "gpt" in model_name.lower():
35
  model_type = ModelType.OpenAI
36
- elif "chatglm" in model_name.upper():
37
  model_type = ModelType.ChatGLM
38
  else:
39
  model_type = ModelType.LLaMA
@@ -59,7 +59,10 @@ class BaseLLMModel:
59
  self.all_token_counts = []
60
  self.model_name = model_name
61
  self.model_type = ModelType.get_type(model_name)
62
- self.token_upper_limit = MODEL_TOKEN_LIMIT[model_name]
 
 
 
63
  self.interrupted = False
64
  self.system_prompt = system_prompt
65
  self.api_key = None
@@ -79,7 +82,9 @@ class BaseLLMModel:
79
  conversations are stored in self.history, with the most recent question, in OpenAI format
80
  should return a generator, each time give the next word (str) in the answer
81
  """
82
- pass
 
 
83
 
84
  def get_answer_at_once(self):
85
  """predict at once, need to be implemented
@@ -88,15 +93,22 @@ class BaseLLMModel:
88
  the answer (str)
89
  total token count (int)
90
  """
91
- pass
 
 
 
 
 
92
 
93
  def billing_info(self):
94
  """get billing information, implement if needed"""
 
95
  return BILLING_NOT_APPLICABLE_MSG
96
 
97
  def count_token(self, user_input):
98
  """get token count from input, implement if needed"""
99
- return 0
 
100
 
101
  def stream_next_chatbot(self, inputs, chatbot, fake_input=None, display_append=""):
102
  def get_return_value():
@@ -234,7 +246,7 @@ class BaseLLMModel:
234
  else:
235
  display_reference = ""
236
 
237
- if len(self.api_key) == 0 and not shared.state.multi_api_key:
238
  status_text = STANDARD_ERROR_MSG + NO_APIKEY_MSG
239
  logging.info(status_text)
240
  chatbot.append((inputs, ""))
 
33
  model_type = None
34
  if "gpt" in model_name.lower():
35
  model_type = ModelType.OpenAI
36
+ elif "chatglm" in model_name.lower():
37
  model_type = ModelType.ChatGLM
38
  else:
39
  model_type = ModelType.LLaMA
 
59
  self.all_token_counts = []
60
  self.model_name = model_name
61
  self.model_type = ModelType.get_type(model_name)
62
+ try:
63
+ self.token_upper_limit = MODEL_TOKEN_LIMIT[model_name]
64
+ except KeyError:
65
+ self.token_upper_limit = DEFAULT_TOKEN_LIMIT
66
  self.interrupted = False
67
  self.system_prompt = system_prompt
68
  self.api_key = None
 
82
  conversations are stored in self.history, with the most recent question, in OpenAI format
83
  should return a generator, each time give the next word (str) in the answer
84
  """
85
+ logging.warning("stream predict not implemented, using at once predict instead")
86
+ response, _ = self.get_answer_at_once()
87
+ yield response
88
 
89
  def get_answer_at_once(self):
90
  """predict at once, need to be implemented
 
93
  the answer (str)
94
  total token count (int)
95
  """
96
+ logging.warning("at once predict not implemented, using stream predict instead")
97
+ response_iter = self.get_answer_stream_iter()
98
+ count = 0
99
+ for response in response_iter:
100
+ count += 1
101
+ return response, sum(self.all_token_counts) + count
102
 
103
  def billing_info(self):
104
  """get billing information, implement if needed"""
105
+ logging.warning("billing info not implemented, using default")
106
  return BILLING_NOT_APPLICABLE_MSG
107
 
108
  def count_token(self, user_input):
109
  """get token count from input, implement if needed"""
110
+ logging.warning("token count not implemented, using default")
111
+ return len(user_input)
112
 
113
  def stream_next_chatbot(self, inputs, chatbot, fake_input=None, display_append=""):
114
  def get_return_value():
 
246
  else:
247
  display_reference = ""
248
 
249
+ if self.api_key is not None and len(self.api_key) == 0 and not shared.state.multi_api_key:
250
  status_text = STANDARD_ERROR_MSG + NO_APIKEY_MSG
251
  logging.info(status_text)
252
  chatbot.append((inputs, ""))
modules/models.py CHANGED
@@ -10,6 +10,14 @@ import requests
10
  import urllib3
11
  import platform
12
 
 
 
 
 
 
 
 
 
13
  from tqdm import tqdm
14
  import colorama
15
  from duckduckgo_search import ddg
@@ -213,27 +221,39 @@ class ChatGLM_Client(BaseLLMModel):
213
  else:
214
  model_source = f"THUDM/{model_name}"
215
  self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
 
 
 
 
 
 
 
216
  if torch.cuda.is_available():
217
  # run on CUDA
218
- model = AutoModel.from_pretrained(model_source, trust_remote_code=True).half().cuda()
219
- elif system_name == "Darwin" and model_path is not None:
220
- # running on macOS and model already downloaded
221
- model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().to('mps')
 
 
 
222
  else:
223
- # run on CPU
224
- model = AutoModel.from_pretrained(model_source, trust_remote_code=True).float()
225
  model = model.eval()
226
  self.model = model
227
 
228
  def _get_glm_style_input(self):
229
  history = [x["content"] for x in self.history]
230
  query = history.pop()
 
 
 
231
  return history, query
232
 
233
  def get_answer_at_once(self):
234
  history, query = self._get_glm_style_input()
235
  response, _ = self.model.chat(self.tokenizer, query, history=history)
236
- return response
237
 
238
  def get_answer_stream_iter(self):
239
  history, query = self._get_glm_style_input()
@@ -241,6 +261,100 @@ class ChatGLM_Client(BaseLLMModel):
241
  temperature=self.temperature):
242
  yield response
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
  def get_model(
246
  model_name, access_key=None, temperature=None, top_p=None, system_prompt=None
@@ -248,7 +362,7 @@ def get_model(
248
  msg = f"模型设置为了: {model_name}"
249
  logging.info(msg)
250
  model_type = ModelType.get_type(model_name)
251
- del model
252
  if model_type == ModelType.OpenAI:
253
  model = OpenAIClient(
254
  model_name=model_name,
@@ -265,29 +379,30 @@ def get_model(
265
  if __name__ == "__main__":
266
  with open("config.json", "r") as f:
267
  openai_api_key = cjson.load(f)["openai_api_key"]
268
- client = OpenAIClient("gpt-3.5-turbo", openai_api_key)
 
269
  chatbot = []
270
- stream = False
271
  # 测试账单功能
272
- print(colorama.Back.GREEN + "测试账单功能" + colorama.Back.RESET)
273
- print(client.billing_info())
274
  # 测试问答
275
- print(colorama.Back.GREEN + "测试问答" + colorama.Back.RESET)
276
  question = "巴黎是中国的首都吗?"
277
  for i in client.predict(inputs=question, chatbot=chatbot, stream=stream):
278
- print(i)
279
- print(f"测试问答后history : {client.history}")
280
  # 测试记忆力
281
- print(colorama.Back.GREEN + "测试记忆力" + colorama.Back.RESET)
282
  question = "我刚刚问了你什么问题?"
283
  for i in client.predict(inputs=question, chatbot=chatbot, stream=stream):
284
- print(i)
285
- print(f"测试记忆力后history : {client.history}")
286
  # 测试重试功能
287
- print(colorama.Back.GREEN + "测试重试功能" + colorama.Back.RESET)
288
  for i in client.retry(chatbot=chatbot, stream=stream):
289
- print(i)
290
- print(f"重试后history : {client.history}")
291
  # # 测试总结功能
292
  # print(colorama.Back.GREEN + "测试总结功能" + colorama.Back.RESET)
293
  # chatbot, msg = client.reduce_token_size(chatbot=chatbot)
 
10
  import urllib3
11
  import platform
12
 
13
+ from dataclasses import dataclass, field
14
+ from transformers import HfArgumentParser
15
+
16
+ from lmflow.datasets.dataset import Dataset
17
+ from lmflow.pipeline.auto_pipeline import AutoPipeline
18
+ from lmflow.models.auto_model import AutoModel
19
+ from lmflow.args import ModelArguments, DatasetArguments, AutoArguments
20
+
21
  from tqdm import tqdm
22
  import colorama
23
  from duckduckgo_search import ddg
 
221
  else:
222
  model_source = f"THUDM/{model_name}"
223
  self.tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
224
+ quantified = False
225
+ if "int4" in model_name:
226
+ quantified = True
227
+ if quantified:
228
+ model = AutoModel.from_pretrained(model_source, trust_remote_code=True).float()
229
+ else:
230
+ model = AutoModel.from_pretrained(model_source, trust_remote_code=True).half()
231
  if torch.cuda.is_available():
232
  # run on CUDA
233
+ logging.info("CUDA is available, using CUDA")
234
+ model = model.cuda()
235
+ # mps加速还存在一些问题,暂时不使用
236
+ # elif system_name == "Darwin" and model_path is not None:
237
+ # logging.info("Running on macOS, using MPS")
238
+ # # running on macOS and model already downloaded
239
+ # model = model.to('mps')
240
  else:
241
+ logging.info("GPU is not available, using CPU")
 
242
  model = model.eval()
243
  self.model = model
244
 
245
  def _get_glm_style_input(self):
246
  history = [x["content"] for x in self.history]
247
  query = history.pop()
248
+ logging.info(colorama.Fore.YELLOW + f"{history}" + colorama.Fore.RESET)
249
+ assert len(history) % 2 == 0
250
+ history = [[history[i], history[i+1]] for i in range(0, len(history), 2)]
251
  return history, query
252
 
253
  def get_answer_at_once(self):
254
  history, query = self._get_glm_style_input()
255
  response, _ = self.model.chat(self.tokenizer, query, history=history)
256
+ return response, len(response)
257
 
258
  def get_answer_stream_iter(self):
259
  history, query = self._get_glm_style_input()
 
261
  temperature=self.temperature):
262
  yield response
263
 
264
+ @dataclass
265
+ class ChatbotArguments:
266
+ pass
267
+
268
+ class LLaMA_Client(BaseLLMModel):
269
+ def __init__(
270
+ self,
271
+ model_name,
272
+ lora_path = None,
273
+ ) -> None:
274
+ super().__init__(
275
+ model_name=model_name
276
+ )
277
+ self.max_generation_token = 1000
278
+ pipeline_name = "inferencer"
279
+ PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
280
+
281
+ parser = HfArgumentParser((
282
+ ModelArguments,
283
+ PipelineArguments,
284
+ ChatbotArguments,
285
+ ))
286
+ model_args, pipeline_args, chatbot_args = (
287
+ parser.parse_args_into_dataclasses()
288
+ )
289
+
290
+ with open (pipeline_args.deepspeed, "r") as f:
291
+ ds_config = json.load(f)
292
+
293
+ self.model = AutoModel.get_model(
294
+ model_args,
295
+ tune_strategy='none',
296
+ ds_config=ds_config,
297
+ )
298
+
299
+ # We don't need input data, we will read interactively from stdin
300
+ data_args = DatasetArguments(dataset_path=None)
301
+ self.dataset = Dataset(data_args)
302
+
303
+ self.inferencer = AutoPipeline.get_pipeline(
304
+ pipeline_name=pipeline_name,
305
+ model_args=model_args,
306
+ data_args=data_args,
307
+ pipeline_args=pipeline_args,
308
+ )
309
+
310
+ # Chats
311
+ model_name = model_args.model_name_or_path
312
+ if model_args.lora_model_path is not None:
313
+ model_name += f" + {model_args.lora_model_path}"
314
+
315
+ # context = (
316
+ # "You are a helpful assistant who follows the given instructions"
317
+ # " unconditionally."
318
+ # )
319
+ self.end_string = "\n\n"
320
+
321
+ def _get_llama_style_input(self):
322
+ history = [x["content"] for x in self.history]
323
+ context = "\n".join(history)
324
+ return context
325
+
326
+
327
+ def get_answer_at_once(self):
328
+ context = self._get_llama_style_input()
329
+
330
+ input_dataset = self.dataset.from_dict({
331
+ "type": "text_only",
332
+ "instances": [ { "text": context } ]
333
+ })
334
+
335
+ output_dataset = self.inferencer.inference(
336
+ model=self.model,
337
+ dataset=input_dataset,
338
+ max_new_tokens=self.max_generation_token,
339
+ temperature=self.temperature,
340
+ )
341
+
342
+ response = output_dataset.to_dict()["instances"][0]["text"]
343
+
344
+ try:
345
+ index = response.index(self.end_string)
346
+ except ValueError:
347
+ response += self.end_string
348
+ index = response.index(self.end_string)
349
+
350
+ response = response[:index + 1]
351
+ return response, len(response)
352
+
353
+ def get_answer_stream_iter(self):
354
+ response, _ = self.get_answer_at_once()
355
+ yield response
356
+
357
+
358
 
359
  def get_model(
360
  model_name, access_key=None, temperature=None, top_p=None, system_prompt=None
 
362
  msg = f"模型设置为了: {model_name}"
363
  logging.info(msg)
364
  model_type = ModelType.get_type(model_name)
365
+ print(model_type.name)
366
  if model_type == ModelType.OpenAI:
367
  model = OpenAIClient(
368
  model_name=model_name,
 
379
  if __name__ == "__main__":
380
  with open("config.json", "r") as f:
381
  openai_api_key = cjson.load(f)["openai_api_key"]
382
+ # client, _ = get_model("gpt-3.5-turbo", openai_api_key)
383
+ client, _ = get_model("chatglm-6b-int4")
384
  chatbot = []
385
+ stream = True
386
  # 测试账单功能
387
+ logging.info(colorama.Back.GREEN + "测试账单功能" + colorama.Back.RESET)
388
+ logging.info(client.billing_info())
389
  # 测试问答
390
+ logging.info(colorama.Back.GREEN + "测试问答" + colorama.Back.RESET)
391
  question = "巴黎是中国的首都吗?"
392
  for i in client.predict(inputs=question, chatbot=chatbot, stream=stream):
393
+ logging.info(i)
394
+ logging.info(f"测试问答后history : {client.history}")
395
  # 测试记忆力
396
+ logging.info(colorama.Back.GREEN + "测试记忆力" + colorama.Back.RESET)
397
  question = "我刚刚问了你什么问题?"
398
  for i in client.predict(inputs=question, chatbot=chatbot, stream=stream):
399
+ logging.info(i)
400
+ logging.info(f"测试记忆力后history : {client.history}")
401
  # 测试重试功能
402
+ logging.info(colorama.Back.GREEN + "测试重试功能" + colorama.Back.RESET)
403
  for i in client.retry(chatbot=chatbot, stream=stream):
404
+ logging.info(i)
405
+ logging.info(f"重试后history : {client.history}")
406
  # # 测试总结功能
407
  # print(colorama.Back.GREEN + "测试总结功能" + colorama.Back.RESET)
408
  # chatbot, msg = client.reduce_token_size(chatbot=chatbot)
modules/presets.py CHANGED
@@ -62,6 +62,8 @@ MODELS = [
62
  "chatglm-6b-int4-qe"
63
  ] # 可选的模型
64
 
 
 
65
  MODEL_TOKEN_LIMIT = {
66
  "gpt-3.5-turbo": 4096,
67
  "gpt-3.5-turbo-0301": 4096,
@@ -72,6 +74,7 @@ MODEL_TOKEN_LIMIT = {
72
  }
73
 
74
  TOKEN_OFFSET = 1000 # 模型的token上限减去这个值,得到软上限。到达软上限之后,自动尝试减少token占用。
 
75
  REDUCE_TOKEN_FACTOR = 0.5 # 与模型token上限相乘,得到目标token数。减少token占用时,将token占用减少到目标token数以下。
76
 
77
  REPLY_LANGUAGES = [
 
62
  "chatglm-6b-int4-qe"
63
  ] # 可选的模型
64
 
65
+ DEFAULT_MODEL = 0 # 默认的模型在MODELS中的序号,从0开始数
66
+
67
  MODEL_TOKEN_LIMIT = {
68
  "gpt-3.5-turbo": 4096,
69
  "gpt-3.5-turbo-0301": 4096,
 
74
  }
75
 
76
  TOKEN_OFFSET = 1000 # 模型的token上限减去这个值,得到软上限。到达软上限之后,自动尝试减少token占用。
77
+ DEFAULT_TOKEN_LIMIT = 3000 # 默认的token上限
78
  REDUCE_TOKEN_FACTOR = 0.5 # 与模型token上限相乘,得到目标token数。减少token占用时,将token占用减少到目标token数以下。
79
 
80
  REPLY_LANGUAGES = [
requirements.txt CHANGED
@@ -15,3 +15,7 @@ pdfplumber
15
  pandas
16
  transformers
17
  torch
 
 
 
 
 
15
  pandas
16
  transformers
17
  torch
18
+ mpi4py
19
+ icetk
20
+ git+https://github.com/OptimalScale/LMFlow.git
21
+ cpm-kernels