Spaces:
Running
Running
File size: 4,613 Bytes
f1d7582 ddd5852 1698f0a ddd5852 f1d7582 1698f0a f1d7582 1698f0a f1d7582 1698f0a f1d7582 1698f0a f1d7582 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
# Copyright 2022 The HuggingFace Datasets Authors and the current metric script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""FrugalScore metric."""
import datasets
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import evaluate
# BibTeX entry for the FrugalScore paper; surfaced through MetricInfo.citation.
_CITATION = """\
@article{eddine2021frugalscore,
title={FrugalScore: Learning Cheaper, Lighter and Faster Evaluation Metrics for Automatic Text Generation},
author={Eddine, Moussa Kamal and Shang, Guokan and Tixier, Antoine J-P and Vazirgiannis, Michalis},
journal={arXiv preprint arXiv:2110.08559},
year={2021}
}
"""

# One-paragraph summary of the metric; surfaced through MetricInfo.description.
_DESCRIPTION = """\
FrugalScore is a reference-based metric for NLG models evaluation. It is based on a distillation approach that allows to learn a fixed, low cost version of any expensive NLG metric, while retaining most of its original performance.
"""

# Usage documentation; surfaced through MetricInfo.inputs_description.
# The ">>>" example lines are kept verbatim because they double as a doctest.
_KWARGS_DESCRIPTION = """
Calculates how good are predictions given some references, using certain scores.
Args:
    predictions (list of str): list of predictions to score. Each prediction
        should be a string.
    references (list of str): list of references, one for each prediction. Each
        reference should be a string.
    batch_size (int): the batch size for predictions. Defaults to 32.
    max_length (int): maximum sequence length. Defaults to 128.
    device (str): either gpu or cpu. Defaults to gpu when CUDA is available,
        cpu otherwise.
Returns:
    scores (list of float): list of scores, one per prediction.
Examples:
    >>> frugalscore = evaluate.load("frugalscore")
    >>> results = frugalscore.compute(predictions=['hello there', 'huggingface'], references=['hello world', 'hugging face'])
    >>> print([round(s, 3) for s in results["scores"]])
    [0.631, 0.645]
"""
# FrugalScore metric: scores each (prediction, reference) pair with a
# sequence-classification checkpoint. The checkpoint is chosen by the metric's
# config name, and inference is run through a transformers ``Trainer``.
# (Documented with comments rather than a class docstring so that the text
# assembled by ``add_start_docstrings`` is left unchanged.)
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class FRUGALSCORE(evaluate.Metric):
    def _info(self):
        """Return the metric metadata and the expected input schema.

        Both ``predictions`` and ``references`` are declared as plain strings.
        """
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            homepage="https://github.com/moussaKam/FrugalScore",
        )

    def _download_and_prepare(self, dl_manager):
        """Load the scoring model and its tokenizer onto ``self``.

        The ``"default"`` config name maps to the
        ``moussaKam/frugalscore_tiny_bert-base_bert-score`` checkpoint; any
        other config name is passed to ``from_pretrained`` as the checkpoint
        identifier.

        Args:
            dl_manager: accepted for the hook signature; not used here.
        """
        if self.config_name == "default":
            checkpoint = "moussaKam/frugalscore_tiny_bert-base_bert-score"
        else:
            checkpoint = self.config_name
        self.model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def _compute(
        self,
        predictions,
        references,
        batch_size=32,
        max_length=128,
        device=None,
    ):
        """Score every prediction against its reference.

        Args:
            predictions: sequence of candidate strings.
            references: sequence of reference strings, same length as
                ``predictions``.
            batch_size: per-device batch size used for inference.
            max_length: truncation length passed to the tokenizer.
            device: ``"gpu"``, ``"cpu"`` or ``None``. ``None`` selects
                ``"gpu"`` when ``torch.cuda.is_available()`` and ``"cpu"``
                otherwise.

        Returns:
            dict with a single key ``"scores"`` holding one Python float per
            input pair, in input order.

        Raises:
            AssertionError: if the two inputs differ in length or ``device``
                is not one of the accepted values.
        """
        # Validation stays assert-based so callers keep seeing AssertionError.
        assert len(predictions) == len(
            references
        ), "predictions and references should have the same number of sentences."
        if device is not None:
            assert device in ["gpu", "cpu"], "device should be either gpu or cpu."
        else:
            device = "gpu" if torch.cuda.is_available() else "cpu"

        # Nothing to score: skip building a Trainer for an empty dataset.
        if len(predictions) == 0:
            return {"scores": []}

        training_args = TrainingArguments(
            "trainer",
            fp16=(device == "gpu"),  # half precision only on the GPU path
            per_device_eval_batch_size=batch_size,
            # Inference only: do not attach installed experiment trackers
            # (the previous "all" enabled every installed integration).
            report_to="none",
            no_cuda=(device == "cpu"),
            log_level="warning",
        )

        raw_datasets = datasets.Dataset.from_dict({"sentence1": predictions, "sentence2": references})

        def tokenize_function(data):
            # Encode each prediction/reference as a sentence pair.
            return self.tokenizer(
                data["sentence1"], data["sentence2"], max_length=max_length, truncation=True, padding=True
            )

        tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
        # remove_columns returns a new dataset; the result must be kept for the
        # raw text columns to actually be dropped.
        tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2"])

        trainer = Trainer(self.model, training_args, tokenizer=self.tokenizer)
        # Use a distinct name so the ``predictions`` argument is not shadowed.
        output = trainer.predict(tokenized_datasets)
        # Drop the trailing size-1 axis and convert to plain Python floats, so
        # the scores are JSON-serializable and print the same on any NumPy.
        return {"scores": output.predictions.squeeze(-1).tolist()}
|