# baLLseM/model/llm.py — initial commit (8578816) by hqms
# NOTE(review): everything below is deliberately commented out — this module
# currently defines nothing at import time. The disabled code loaded the
# Indonesian instruction-tuned model "asyafiqe/Merak-7B-v3-Mini-Orca-Indo-GPTQ"
# (4-bit GPTQ quantized, via the third-party auto_gptq package) onto cuda:0
# and exposed predict(prompt) -> str, the generated assistant reply.
# Kept for reference; confirm with the author before deleting or re-enabling
# (re-enabling requires a CUDA device and the auto_gptq dependency).
# from transformers import AutoTokenizer, pipeline, logging
# from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
# model_name_or_path = "asyafiqe/Merak-7B-v3-Mini-Orca-Indo-GPTQ"
# model_basename = "Merak-7B-v3-Mini-Orca-Indo-GPTQ"
# use_triton = False
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# Model load happens at module import — an expensive, GPU-dependent side
# effect; if re-enabled, consider lazy initialization inside predict().
# model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
# model_basename=model_basename,
# use_safetensors=True,
# trust_remote_code=True,
# device="cuda:0",
# use_triton=use_triton,
# quantize_config=None)
# predict(prompt): wraps the user prompt in a SYSTEM/USER/ASSISTANT template
# (system message is Indonesian: "You are an AI assistant ... produce a
# detailed, long answer") and returns the pipeline's generated text.
# def predict(prompt):
# # prompt = "Buat rencana untuk menghemat listrik di rumah"
# system_message = "Anda adalah asisten AI. Anda akan diberi tugas. Anda harus menghasilkan jawaban yang rinci dan panjang.\n"
# prompt_template=f'''SYSTEM: {system_message}
# USER: {prompt}
# ASSISTANT: '''
# print("\n\n*** Generate:")
# NOTE(review): this direct model.generate() call only prints its output —
# the returned value comes from the pipeline call below, so this first
# generation is redundant work if re-enabled.
# input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
# output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
# print(tokenizer.decode(output[0]))
# # Inference can also be done using transformers' pipeline
# # Prevent printing spurious transformers error when using pipeline with AutoGPTQ
# logging.set_verbosity(logging.CRITICAL)
# print("*** Pipeline:")
# pipe = pipeline(
# "text-generation",
# model=model,
# tokenizer=tokenizer,
# max_new_tokens=512,
# temperature=0.7,
# top_p=0.95,
# repetition_penalty=1.15
# )
# 'generated_text' includes the prompt template plus the completion.
# result = pipe(prompt_template)[0]['generated_text']
# return result