|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Evaluator for the POPE dataset (https://github.com/RUCAIBox/POPE). |
|
|
|
POPE is a binary classification dataset with ground-truth answers being either |
|
'yes' or 'no'. |
|
""" |
|
|
|
import functools |
|
|
|
import big_vision.datasets.core |
|
import big_vision.evaluators.common as c |
|
import big_vision.input_pipeline |
|
import big_vision.pp.builder |
|
import big_vision.pp.tokenizer |
|
import big_vision.utils as u |
|
|
|
|
|
|
|
|
|
# Declares which predict-function calling convention this evaluator expects.
# NOTE(review): the "jit" semantics are defined by the evaluator framework,
# not visible in this file — presumably a jitted predict_fn; confirm there.
API = "jit"
|
|
|
|
|
class Evaluator:
  """Evaluator for the POPE task.

  POPE is a binary classification task whose ground-truth answers are either
  'yes' or 'no'. This evaluator expects each batch to contain a `question_id`
  field and an `answer` field holding the single ground-truth string.

  The field names used when writing the json result can be controlled with
  `out_question_key` and `out_answer_key`.
  """

  def __init__(
      self,
      predict_fn,
      data,
      pp_fn,
      tokenizer,
      batch_size,
      *,
      devices,
      outfile="{workdir}/{split}.json",
      out_question_key="question_id",
      out_answer_key="answer"
  ):
    """Initializes the evaluator.

    Args:
      predict_fn: Decoding function; called as
        `predict_fn(train_state, batch, devices=..., eos_token=...)` and
        expected to return output tokens for the batch.
      data: Dict of dataset-constructor kwargs passed to
        `big_vision.datasets.core.get`; its "split" entry (if present) is
        substituted into `outfile`.
      pp_fn: Preprocessing spec, resolved with
        `big_vision.pp.builder.get_preprocess_fn`.
      tokenizer: Tokenizer spec, resolved with
        `big_vision.pp.tokenizer.get_tokenizer`.
      batch_size: Global batch size for the inference pipeline.
      devices: Devices the global input pipeline and `predict_fn` run on.
      outfile: Template for the json results path; `{workdir}` and `{split}`
        are resolved by `c.resolve_outfile`.
      out_question_key: Json key under which each question id is written.
      out_answer_key: Json key under which each predicted answer is written.
    """
    self.outfile = c.resolve_outfile(outfile, split=data.get("split"))
    self.out_question_key = out_question_key
    self.out_answer_key = out_answer_key

    data = big_vision.datasets.core.get(**data)
    pp_fn = big_vision.pp.builder.get_preprocess_fn(pp_fn)
    self.ds, self.steps = big_vision.input_pipeline.make_for_inference(
        data.get_tfdata(ordered=True),
        pp_fn,
        batch_size,
        num_ex_per_process=data.num_examples_per_process(),
    )

    # Ground-truth strings and ids are only needed host-side for metric
    # computation, so keep them off the devices.
    self.data_iter = big_vision.input_pipeline.start_global(
        self.ds, devices, keep_on_cpu={"answer", "question_id"}
    )

    self.tok = big_vision.pp.tokenizer.get_tokenizer(tokenizer)
    self.decode = functools.partial(
        predict_fn, devices=devices, eos_token=self.tok.eos_token
    )

  def run(self, train_state):
    """Does one evaluation run, yields metrics.

    Yields:
      ("acc", float): fraction of predictions exactly matching ground truth.
      ("valid_percent", float): fraction of predictions that are "yes"/"no".
      ("num", int): number of (non-padding) examples evaluated globally.
    Also writes per-example predictions to `self.outfile` as json.
    """
    accuracies = []
    valid = []
    json_out = []
    for _, batch in zip(range(self.steps), self.data_iter):
      tokens = self.decode(train_state, batch)

      # Pull this host's local slice of the (possibly sharded) arrays to CPU.
      tokens = u.get_local_slice_from_fsarray(tokens)
      ex_masks = u.get_local_slice_from_fsarray(batch["_mask"])

      for i in range(len(tokens)):
        if ex_masks[i] == 0:  # Padding example added to fill the last batch.
          continue

        answer = self.tok.to_str(tokens[i], stop_at_eos=True).lower()
        gt = batch["answer"][i]
        # String fields coming out of the (TF-backed) input pipeline are
        # typically `bytes` on the host; decode so the comparison with the
        # decoded `str` prediction is meaningful. No-op if already `str`.
        if isinstance(gt, bytes):
          gt = gt.decode("utf-8")
        accuracies.append(float(answer == gt))
        valid.append(float(answer in ("yes", "no")))

        json_out.append(
            {
                self.out_question_key: batch["question_id"][i].item(),
                self.out_answer_key: answer,
            }
        )

    # Sum the per-host partial sums across processes so that every host
    # reports the same global metrics.
    sum_accs, sum_valid, num = c.process_sum([
        sum(accuracies),
        sum(valid),
        len(accuracies),
    ])

    if num:
      yield "acc", sum_accs / num
      yield "valid_percent", sum_valid / num
      yield "num", num

    c.multiprocess_write_json(self.outfile, json_out)
|
|