import os
from typing import Dict, List, Optional, Tuple, Union

import torch
from fastchat.conversation import (compute_skip_echo_len,
                                   get_default_conv_template)
from fastchat.serve.inference import load_model as load_fastchat_model
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

MODEL_CACHE_PATH = os.path.join(os.path.dirname(__file__), 'model_cache')
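
# Supported checkpoints, grouped by model family:
# display name -> Hugging Face repo id or local checkpoint path.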
llm_model_dict = {
    "chatglm": {
        "ChatGLM-6B": "THUDM/chatglm-6b",
        "ChatGLM-6B-int4": "THUDM/chatglm-6b-int4",
        "ChatGLM-6B-int8": "THUDM/chatglm-6b-int8",
        "ChatGLM-6b-int4-qe": "THUDM/chatglm-6b-int4-qe"
    },
    "belle": {
        "BELLE-LLaMA-Local": "/pretrainmodel/belle",
    },
    "vicuna": {
        "Vicuna-Local": "/pretrainmodel/vicuna",
    }
}

os.environ["TOKENIZERS_PARALLELISM"] = "false"

DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
DEVICE_ID = "0"
CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE


def torch_gc():
    """Free cached CUDA memory and collect IPC handles after a generation step."""
    if torch.cuda.is_available():
        with torch.cuda.device(CUDA_DEVICE):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()


def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    # ChatGLM-6B has 28 transformer layers. The word embeddings, final
    # layernorm and lm_head are pinned to GPU 0, so they are counted as
    # 2 extra layers when balancing (hence 30 / num_gpus rather than 28).
    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus

    device_map = {
        'transformer.word_embeddings': 0,
        'transformer.final_layernorm': 0,
        'lm_head': 0
    }

    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        assert gpu_target < num_gpus
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1

    return device_map
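
# Example split for num_gpus=2: GPU 0 holds the word embeddings, final
# layernorm, lm_head and transformer layers 0-12; GPU 1 holds layers 13-27.
# GPU 0 takes fewer transformer layers because it also hosts the non-layer
# modules.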


class ChatLLM(LLM):
    max_token: int = 10000
    temperature: float = 0.1
    top_p: float = 0.9
    history: List[List[Optional[str]]] = []
    model_type: str = "chatglm"
    model_name_or_path: str = "ChatGLM-6B-int4"
    tokenizer: object = None
    model: object = None

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        return "ChatLLM"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        if self.model_type == 'vicuna':
            # Wrap the raw prompt in the Vicuna conversation template and
            # strip the echoed prompt from the decoded output afterwards.
            conv = get_default_conv_template(self.model_name_or_path).copy()
            conv.append_message(conv.roles[0], prompt)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()
            inputs = self.tokenizer([prompt])
            output_ids = self.model.generate(
                torch.as_tensor(inputs.input_ids).cuda(),
                do_sample=True,
                temperature=self.temperature,
                max_new_tokens=self.max_token,
            )
            outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
            skip_echo_len = compute_skip_echo_len(self.model_name_or_path, conv, prompt)
            response = outputs[skip_echo_len:]
            torch_gc()
            if stop is not None:
                response = enforce_stop_tokens(response, stop)
            self.history = [[None, response]]

        elif self.model_type == 'belle':
            # BELLE expects a plain "Human: ... \n\nAssistant: " prompt.
            prompt = "Human: " + prompt + " \n\nAssistant: "
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)
            generate_ids = self.model.generate(
                input_ids,
                max_new_tokens=self.max_token,
                do_sample=True,
                top_k=30,
                top_p=self.top_p,
                temperature=self.temperature,
                repetition_penalty=1.0,
                eos_token_id=2,
                bos_token_id=1,
                pad_token_id=0,
            )
            output = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True,
                                                 clean_up_tokenization_spaces=False)[0]
            response = output[len(prompt) + 1:]
            torch_gc()
            if stop is not None:
                response = enforce_stop_tokens(response, stop)
            self.history = [[None, response]]

        elif self.model_type == 'chatglm':
            # ChatGLM ships its own chat() helper, which handles prompt
            # templating and multi-turn history internally.
            response, _ = self.model.chat(
                self.tokenizer,
                prompt,
                history=self.history,
                max_length=self.max_token,
                temperature=self.temperature,
            )
            torch_gc()
            if stop is not None:
                response = enforce_stop_tokens(response, stop)
            self.history = self.history + [[None, response]]

        return response

    def load_llm(self,
                 llm_device=DEVICE,
                 num_gpus=torch.cuda.device_count(),
                 device_map: Optional[Dict[str, int]] = None,
                 **kwargs):
        if 'chatglm' in self.model_name_or_path.lower():
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name_or_path,
                trust_remote_code=True,
                cache_dir=os.path.join(MODEL_CACHE_PATH, self.model_name_or_path))
            if torch.cuda.is_available() and llm_device.lower().startswith("cuda"):
                num_gpus = torch.cuda.device_count()
                if num_gpus < 2 and device_map is None:
                    # Single GPU: load the whole model on it in fp16.
                    self.model = (AutoModel.from_pretrained(
                        self.model_name_or_path,
                        trust_remote_code=True,
                        cache_dir=os.path.join(MODEL_CACHE_PATH, self.model_name_or_path),
                        **kwargs).half().cuda())
                else:
                    # Multi-GPU: shard the transformer layers across devices.
                    from accelerate import dispatch_model

                    model = AutoModel.from_pretrained(
                        self.model_name_or_path,
                        trust_remote_code=True,
                        cache_dir=os.path.join(MODEL_CACHE_PATH, self.model_name_or_path),
                        **kwargs).half()

                    if device_map is None:
                        device_map = auto_configure_device_map(num_gpus)

                    self.model = dispatch_model(model, device_map=device_map)
            else:
                # No usable GPU: fall back to fp32 on CPU or MPS.
                self.model = (AutoModel.from_pretrained(
                    self.model_name_or_path,
                    trust_remote_code=True,
                    cache_dir=os.path.join(MODEL_CACHE_PATH, self.model_name_or_path)).float().to(llm_device))
            self.model = self.model.eval()

        else:
            # BELLE and Vicuna checkpoints are loaded through FastChat.
            self.model, self.tokenizer = load_fastchat_model(
                model_path=self.model_name_or_path,
                device=llm_device,
                num_gpus=num_gpus,
            )
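

# Minimal usage sketch (assumption: the chosen ChatGLM checkpoint from
# llm_model_dict can be downloaded or is already cached under
# MODEL_CACHE_PATH):
#
#   llm = ChatLLM()
#   llm.model_type = "chatglm"
#   llm.model_name_or_path = llm_model_dict["chatglm"]["ChatGLM-6B-int4"]
#   llm.load_llm(llm_device=DEVICE)
#   print(llm("Briefly introduce LangChain."))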