dragon-yi-6B-v0-GPTQ / generation_test_hf_script.py
TheBloke's picture
GPTQ model commit
4c30d2e
raw
history blame
2.64 kB
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
def load_rag_benchmark_tester_ds():
    """Fetch the LLMWare RAG benchmark and return its samples as a list.

    Loads the "llmware/rag_instruct_benchmark_tester" dataset through
    ``datasets.load_dataset``, prints the loaded dataset object, and copies
    every record of its "train" split into a plain Python list.

    Returns:
        list: the records of the "train" split, in dataset order.
    """
    # function-scope import, as in the original script
    from datasets import load_dataset

    # 200 question rag benchmark test dataset from the LLMWare HuggingFace repo
    benchmark_repo = "llmware/rag_instruct_benchmark_tester"
    benchmark = load_dataset(benchmark_repo)

    print("update: loading RAG Benchmark test dataset - ", benchmark)

    # only the "train" split is read; materialize it so callers get a list
    return list(benchmark["train"])
def run_test(model_name, test_ds):
    """Generate an answer for every benchmark sample and print it next to the gold answer.

    Loads the causal LM and tokenizer named by ``model_name`` (with
    ``trust_remote_code=True``), moves the model to CUDA when available
    (CPU otherwise), and for each record builds a "<human>: ... <bot>:" prompt,
    samples up to 100 new tokens, strips known fine-tuning artifacts from the
    decoded text, and prints the response together with the expected answer.

    Args:
        model_name: model identifier passed to ``from_pretrained``.
        test_ds: iterable of records providing "context", "query" and
            "answer" keys.

    Returns:
        int: always 0.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    print("\nRAG Performance Test - 200 questions")
    print("update: model - ", model_name)
    print("update: device - ", device)

    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    model.to(device)

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    for i, entries in enumerate(test_ds):

        # prepare prompt packaging used in fine-tuning process
        # note: in our testing, Yi model performed better with trailing "\n" at end of prompt
        new_prompt = "<human>: " + entries["context"] + "\n" + entries["query"] + "\n" + "<bot>:" + "\n"

        inputs = tokenizer(new_prompt, return_tensors="pt")
        input_ids = inputs.input_ids.to(device)

        # number of prompt tokens; everything after this index is newly generated
        start_of_output = len(input_ids[0])

        # pass the tokenizer's attention mask explicitly: because pad_token_id is
        # set to eos_token_id below, generate() cannot infer the mask on its own
        # and would otherwise warn about potentially unreliable results
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        # temperature: set at 0.3 for consistency of output
        # max_new_tokens: set at 100 - may prematurely stop a few of the summaries
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.3,
            max_new_tokens=100,
        )

        # decode only the generated continuation, not the echoed prompt
        output_only = tokenizer.decode(outputs[0][start_of_output:], skip_special_tokens=True)

        # quick/optional post-processing clean-up of potential fine-tuning artifacts
        eot = output_only.find("<|endoftext|>")
        if eot > -1:
            # drop everything from a literal end-of-text marker onward
            output_only = output_only[:eot]

        bot = output_only.find("<bot>:")
        if bot > -1:
            # keep only the text that follows a repeated "<bot>:" tag
            output_only = output_only[bot + len("<bot>:"):]
        # end - post-processing

        print("\n")
        print(i, "llm_response - ", output_only)
        print(i, "gold_answer - ", entries["answer"])

    return 0
if __name__ == "__main__":

    # model to evaluate
    model_name = "llmware/dragon-yi-6b-v0"

    # benchmark samples to run through the model
    test_ds = load_rag_benchmark_tester_ds()

    # run the benchmark; run_test prints its results and returns 0
    output = run_test(model_name, test_ds)