import math
import os
import sys

import fire
from tqdm import tqdm
import pandas as pd
import torch
import transformers
from peft import PeftModel
import datasets
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

from utils.callbacks import Iteratorize, Stream
from utils.prompter import Prompter

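# Inference device; the model-loading path below is only wired up for CUDA GPUs.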
device = "cuda"


def main(
    load_8bit: bool = True,
    base_model: str = "decapoda-research/llama-7b-hf",
    lora_weights: str = "./lora-alpaca",
    data_path: str = "./data",
    output_path: str = "./output",
    eval_rate: float = 0.1,
    batch_size: int = 32,
    prompt_template: str = "alpaca",
):
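    """Load the base model plus LoRA adapter and evaluate it on a JSON dataset."""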
    base_model = base_model or os.environ.get("BASE_MODEL", "")
    assert base_model, "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"

    prompter = Prompter(prompt_template)
    tokenizer = LlamaTokenizer.from_pretrained(base_model)
    if device == "cuda":
        model = LlamaForCausalLM.from_pretrained(
            base_model,
            load_in_8bit=load_8bit,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        model = PeftModel.from_pretrained(
            model,
            lora_weights,
            torch_dtype=torch.float16,
        )

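    # The decapoda-research checkpoint ships a broken special-token config, so set
    # the LLaMA token ids explicitly: pad/unk = 0, bos = 1, eos = 2.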
    model.config.pad_token_id = tokenizer.pad_token_id = 0
    model.config.bos_token_id = 1
    model.config.eos_token_id = 2

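    # When not loading in 8-bit, cast the full-precision weights down to fp16.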
    if not load_8bit:
        model.half()

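    # Switch to eval mode; torch.compile (PyTorch 2+) can speed up generation but
    # is not supported on Windows.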
    model.eval()
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)

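    # Generate a single response for one (instruction, input) pair and return the
    # text after the prompt, as parsed by the Prompter template.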
    def evaluate_one(
        instruction,
        input=None,
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=2,
        max_new_tokens=128,
        **kwargs,
    ):
        prompt = prompter.generate_prompt(instruction, input)
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].to(device)
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            **kwargs,
        )

        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=max_new_tokens,
            )
        s = generation_output.sequences[0]
        output = tokenizer.decode(s, skip_special_tokens=True)
        return prompter.get_response(output)

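    # Evaluate one sample at a time and report exact-match accuracy against the
    # reference outputs; mismatches are printed at the end.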
    def evaluate_all():
        df = pd.read_json(data_path, orient='records')
        df.info()

        correct = 0
        total = 0
        total_step = len(df)
        pbar = tqdm(total=total_step, unit='batch')
        error = []
        for i in range(total_step):
            instruction = df['instruction'].iloc[i]
            input = df['input'].iloc[i]
            label = df['output'].iloc[i]
            pred = evaluate_one(instruction=instruction, input=input)
            if pred == label:
                correct += 1
            else:
                error.append((label, pred))
            total += 1
            acc = correct / total

            pbar.set_description(
                f"Testing: Sample [{total}/{total_step}] Acc: {acc:.4f}")
            pbar.update(1)

        for e in error:
            print(e)

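    # Batched evaluation: build all prompts up front, generate batch_size prompts
    # per step, and write the decoded predictions to output_path.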
    def evaluate_by_batch(
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=1,
        max_new_tokens=32
    ):
        df = pd.read_json(data_path, orient='records')

        df['prompt'] = df.apply(lambda x: prompter.generate_prompt(
            x['instruction'], x['input']), axis=1)
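        # Decoder-only models must be left-padded for batched generation so that
        # new tokens are appended directly after each prompt.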
        tokenizer.padding_side = "left"

        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams
        )

        outputs = []
        total = 0
        total_step = math.ceil(len(df) / batch_size)
        pbar = tqdm(total=total_step, unit='batch')

        with torch.no_grad():
            for i in range(total_step):
                batch = df.iloc[i*batch_size:(i+1)*batch_size]
                inputs = tokenizer(
                    batch['prompt'].tolist(), return_tensors="pt", padding=True
                )['input_ids'].to(device)

                generation_outputs = model.generate(
                    input_ids=inputs,
                    generation_config=generation_config,
                    max_new_tokens=max_new_tokens,
                    pad_token_id=tokenizer.pad_token_id
                )

                for g in generation_outputs:
                    decoded_item = tokenizer.decode(g, skip_special_tokens=True)
                    try:
                        output = prompter.get_response(decoded_item)
                    except Exception:
                        output = decoded_item
                    outputs.append(output)
                    total += 1

                pbar.set_description(f"Testing: Sample [{total}/{len(df)}] ")
                pbar.update(1)

        df['pred'] = outputs
        df['pred'].to_csv(output_path, index=False)

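    # Only the batched path runs by default; evaluate_all() is kept for
    # per-sample exact-match accuracy.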
    evaluate_by_batch()


if __name__ == "__main__":
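    # Each top-level key in evaluate_params.yaml is expected to map to a dict of
    # keyword arguments for main(); the key is chosen by the first CLI argument,
    # e.g. (hypothetical): python evaluate.py my_dataset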
    import yaml

    dataset_param = sys.argv[1]
    with open("./configs/evaluate_params.yaml", "r") as stream:
        params = yaml.safe_load(stream)

    print('=' * 80)
    print(params[dataset_param])
    print('=' * 80)

    main(**params[dataset_param])