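"""Batched-inference demo for ChatGLM-6B using the faster_chat_glm accelerated
kernel: load a pre-built engine plan, tokenize a batch of prompts, run
generate() on the GPU, and decode the results back to text."""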


from transformers import AutoTokenizer
from faster_chat_glm import GLM6B, FasterChatGLM
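# GLM6B loads a serialized '.ftm' engine plan; FasterChatGLM pairs that kernel
# with the Hugging Face model directory so it can be driven via generate().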


MAX_OUT_LEN = 50    # maximum generated sequence length, in tokens
BATCH_SIZE = 8      # number of prompts processed in one batch
USE_CACHE = True    # select the engine plan built with KV-cache support

print("Prepare config and inputs....")
chatglm6b_dir = './models'
tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)

# Prompt (Chinese): "What factors should music recommendation take into
# account? Help me write a proposal of no less than 800 words."
input_str = ["音乐推荐应该考虑哪些因素?帮我写一篇不少于800字的方案。 ", ] * BATCH_SIZE
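# Tokenize the whole batch at once; padding=True pads every prompt to a common
# length so input_ids comes back as a single (BATCH_SIZE, seq_len) tensor.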
inputs = tokenizer(input_str, return_tensors="pt", padding=True)
input_ids = inputs.input_ids.to('cuda:0')  # move token ids onto the GPU
print(input_ids.shape)  # (BATCH_SIZE, padded_seq_len)


print('Loading faster model...')
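# Pick the matching pre-built engine plan: the KV-cache build reuses attention
# keys/values across decoding steps; the plain build recomputes them each step.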
if USE_CACHE:
    plan_path = f'./models/glm6b-kv-cache-dy-bs{BATCH_SIZE}.ftm'
else:
    plan_path = f'./models/glm6b-bs{BATCH_SIZE}.ftm'

# Build the optimized GLM-6B decoding kernel; the head count, embedding size,
# layer count and vocabulary size below match the stock ChatGLM-6B weights.
kernel = GLM6B(plan_path=plan_path,
               batch_size=BATCH_SIZE,
               num_beams=1,
               use_cache=USE_CACHE,
               num_heads=32,
               emb_size_per_heads=128,
               decoder_layers=28,
               vocab_size=150528,
               max_seq_len=MAX_OUT_LEN)

# Wrap the kernel in a generate()-compatible model, with fp16 weights on the GPU.
chat = FasterChatGLM(model_dir=chatglm6b_dir, kernel=kernel).half().cuda()

# Run batched generation; all BATCH_SIZE prompts decode in parallel.
sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)

# De-tokenize the first and last sequences in the batch to spot-check the output.
res = tokenizer.decode(sample_output[0], skip_special_tokens=True)
print(res)
res = tokenizer.decode(sample_output[BATCH_SIZE - 1], skip_special_tokens=True)
print(res)
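# A minimal sketch (assuming sample_output is a (BATCH_SIZE, seq_len) tensor,
# as the two decodes above suggest): uncomment to print every batch element.
# for i, seq in enumerate(sample_output):
#     print(f'--- sample {i} ---')
#     print(tokenizer.decode(seq, skip_special_tokens=True))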