# File size: 1,760 Bytes
# 8578816
# from transformers import AutoTokenizer, pipeline, logging
# from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
# model_name_or_path = "asyafiqe/Merak-7B-v3-Mini-Orca-Indo-GPTQ"
# model_basename = "Merak-7B-v3-Mini-Orca-Indo-GPTQ"
# use_triton = False
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
#         model_basename=model_basename,
#         use_safetensors=True,
#         trust_remote_code=True,
#         device="cuda:0",
#         use_triton=use_triton,
#         quantize_config=None)
# def predict(prompt):
#     # prompt = "Buat rencana untuk menghemat listrik di rumah"
#     system_message = "Anda adalah asisten AI. Anda akan diberi tugas. Anda harus menghasilkan jawaban yang rinci dan panjang.\n"
#     prompt_template=f'''SYSTEM: {system_message}
# USER: {prompt}
# ASSISTANT: '''
#     print("\n\n*** Generate:")
#     input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
#     output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
#     print(tokenizer.decode(output[0]))
#     # Inference can also be done using transformers' pipeline
#     # Prevent printing spurious transformers error when using pipeline with AutoGPTQ
#     logging.set_verbosity(logging.CRITICAL)
#     print("*** Pipeline:")
#     pipe = pipeline(
#         "text-generation",
#         model=model,
#         tokenizer=tokenizer,
#         max_new_tokens=512,
#         temperature=0.7,
#         top_p=0.95,
#         repetition_penalty=1.15
#     )
#     result = pipe(prompt_template)[0]['generated_text']
#     return result