from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM

model_name_or_path = "asyafiqe/Merak-7B-v3-Mini-Orca-Indo-GPTQ"
model_basename = "Merak-7B-v3-Mini-Orca-Indo-GPTQ"

use_triton = False

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# Load the GPTQ-quantized checkpoint; quantize_config=None means the
# quantization settings are read from the files shipped with the repo.
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=True,
    device="cuda:0",
    use_triton=use_triton,
    quantize_config=None,
)
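
# Recent transformers releases (>= 4.32, with optimum and auto-gptq installed)
# can also load GPTQ checkpoints directly, without AutoGPTQForCausalLM. A
# minimal sketch, not verified against this exact checkpoint:
#
# from transformers import AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="auto")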

def predict(prompt):
    # prompt = "Buat rencana untuk menghemat listrik di rumah"
    # ("Make a plan to save electricity at home")

    # The system message is Indonesian for: "You are an AI assistant. You will
    # be given a task. You must produce a detailed and long answer."
    system_message = "Anda adalah asisten AI. Anda akan diberi tugas. Anda harus menghasilkan jawaban yang rinci dan panjang.\n"

    # SYSTEM/USER/ASSISTANT prompt template expected by this model.
    prompt_template = f'''SYSTEM: {system_message}
USER: {prompt}
ASSISTANT: '''

    print("\n\n*** Generate:")
    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
    # do_sample=True is needed for temperature to take effect;
    # otherwise generation is greedy and the setting is ignored.
    output = model.generate(inputs=input_ids, do_sample=True, temperature=0.7,
                            max_new_tokens=512)
    print(tokenizer.decode(output[0]))
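
    # For interactive use, transformers' TextStreamer can print tokens as they
    # are generated instead of returning everything at the end. A minimal
    # sketch, assuming a transformers version that ships TextStreamer:
    #
    # from transformers import TextStreamer
    # streamer = TextStreamer(tokenizer, skip_prompt=True)
    # model.generate(inputs=input_ids, do_sample=True, temperature=0.7,
    #                max_new_tokens=512, streamer=streamer)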

    # Inference can also be done using transformers' pipeline.
    # Prevent printing spurious transformers errors when using pipeline with AutoGPTQ.
    logging.set_verbosity(logging.CRITICAL)

    print("*** Pipeline:")
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.15,
    )
    result = pipe(prompt_template)[0]['generated_text']
    return result
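
# A minimal usage sketch; assumes a CUDA GPU, since the model and input
# tensors above are placed on cuda:0.
if __name__ == "__main__":
    answer = predict("Buat rencana untuk menghemat listrik di rumah")
    print(answer)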