Spaces:
Running
Running
File size: 8,406 Bytes
5657307 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Regard measurement. """
from collections import defaultdict
from operator import itemgetter
from statistics import mean
import datasets
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import evaluate
logger = evaluate.logging.get_logger(__name__)
_CITATION = """
@article{https://doi.org/10.48550/arxiv.1909.01326,
doi = {10.48550/ARXIV.1909.01326},
url = {https://arxiv.org/abs/1909.01326},
author = {Sheng, Emily and Chang, Kai-Wei and Natarajan, Premkumar and Peng, Nanyun},
title = {The Woman Worked as a Babysitter: On Biases in Language Generation},
publisher = {arXiv},
year = {2019}
}
"""
_DESCRIPTION = """\
Regard aims to measure language polarity towards and social perceptions of a demographic (e.g. gender, race, sexual orientation).
"""
_KWARGS_DESCRIPTION = """
Compute the regard of the input sentences.
Args:
`data` (list of str): prediction/candidate sentences, e.g. sentences describing a given demographic group.
`references` (list of str) (optional): reference/comparison sentences, e.g. sentences describing a different demographic group to compare against.
`aggregation` (str) (optional): determines the type of aggregation performed.
If set to `None`, the difference between the regard scores for the two categories is returned.
Otherwise:
- 'average' : returns the average regard for each category (negative, positive, neutral, other) for each group
- 'maximum': returns the maximum regard for each group
Returns:
With only `data` as input (default config):
`regard` : the regard scores of each string in the input list (if no aggregation is specified)
`average_regard`: the average regard for each category (negative, positive, neutral, other) (if `aggregation` = `average`)
`max_regard`: the maximum regard across all input strings (if `aggregation` = `maximum`)
With `data` and `references` as input (`compare` config):
`regard_difference`: the difference between the regard scores for the two groups (if no aggregation is specified)
`average_data_regard` and 'average_references_regard': the average regard for each category (negative, positive, neutral, other) (if `aggregation` = `average`)
`max_data_regard` and 'max_references_regard': the maximum regard for each group (if `aggregation` = `maximum`)
Examples:
Example 1 (single input):
>>> regard = evaluate.load("regard")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> results = regard.compute(data = group1)
>>> for d in results['regard']:
... print({l['label']: round(l['score'],2) for l in d})
{'neutral': 0.95, 'positive': 0.02, 'negative': 0.02, 'other': 0.01}
{'negative': 0.97, 'other': 0.02, 'neutral': 0.01, 'positive': 0.0}
Example 2 (comparison mode):
>>> regard = evaluate.load("regard", "compare")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> group2 = ['xyz are known for making too much noise', 'xyz are described as often violent']
>>> results = regard.compute(data = group1, references = group2)
>>> print({k: round(v, 2) for k, v in results['regard_difference'].items()})
{'neutral': 0.46, 'positive': 0.01, 'negative': -0.46, 'other': -0.01}
Example 3 (returns the maximum regard score per category):
>>> regard = evaluate.load("regard", "compare")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> group2 = ['xyz are known for making too much noise', 'xyz are described as often violent']
>>> results = regard.compute(data = group1, references = group2, aggregation = "maximum")
>>> print({k: round(v, 2) for k, v in results['max_data_regard'].items()})
{'neutral': 0.95, 'positive': 0.02, 'negative': 0.97, 'other': 0.02}
>>> print({k: round(v, 2) for k, v in results['max_references_regard'].items()})
{'negative': 0.98, 'other': 0.04, 'neutral': 0.03, 'positive': 0.0}
Example 4 (returns the average regard score):
>>> regard = evaluate.load("regard", "compare")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> group2 = ['xyz are known for making too much noise', 'xyz are described as often violent']
>>> results = regard.compute(data = group1, references = group2, aggregation = "average")
>>> print({k: round(v, 2) for k, v in results['average_data_regard'].items()})
{'neutral': 0.48, 'positive': 0.01, 'negative': 0.5, 'other': 0.01}
>>> print({k: round(v, 2) for k, v in results['average_references_regard'].items()})
{'negative': 0.96, 'other': 0.02, 'neutral': 0.02, 'positive': 0.0}
"""
def regard(group, regard_classifier):
group_scores = defaultdict(list)
group_regard = regard_classifier(group)
for pred in group_regard:
for pred_score in pred:
group_scores[pred_score["label"]].append(pred_score["score"])
return group_regard, dict(group_scores)
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Regard(evaluate.Measurement):
def _info(self):
if self.config_name not in ["compare", "default"]:
raise KeyError("You should supply a configuration name selected in " '["config", "default"]')
return evaluate.MeasurementInfo(
module_type="measurement",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"data": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
if self.config_name == "compare"
else {
"data": datasets.Value("string", id="sequence"),
}
),
codebase_urls=[],
reference_urls=[],
)
def _download_and_prepare(self, dl_manager):
regard_tokenizer = AutoTokenizer.from_pretrained("sasha/regardv3")
regard_model = AutoModelForSequenceClassification.from_pretrained("sasha/regardv3")
self.regard_classifier = pipeline(
"text-classification", model=regard_model, top_k=4, tokenizer=regard_tokenizer, truncation=True
)
def _compute(
self,
data,
references=None,
aggregation=None,
):
if self.config_name == "compare":
pred_scores, pred_regard = regard(data, self.regard_classifier)
ref_scores, ref_regard = regard(references, self.regard_classifier)
pred_mean = {k: mean(v) for k, v in pred_regard.items()}
pred_max = {k: max(v) for k, v in pred_regard.items()}
ref_mean = {k: mean(v) for k, v in ref_regard.items()}
ref_max = {k: max(v) for k, v in ref_regard.items()}
if aggregation == "maximum":
return {
"max_data_regard": pred_max,
"max_references_regard": ref_max,
}
elif aggregation == "average":
return {"average_data_regard": pred_mean, "average_references_regard": ref_mean}
else:
return {"regard_difference": {key: pred_mean[key] - ref_mean.get(key, 0) for key in pred_mean}}
else:
pred_scores, pred_regard = regard(data, self.regard_classifier)
pred_mean = {k: mean(v) for k, v in pred_regard.items()}
pred_max = {k: max(v) for k, v in pred_regard.items()}
if aggregation == "maximum":
return {"max_regard": pred_max}
elif aggregation == "average":
return {"average_regard": pred_mean}
else:
return {"regard": pred_scores}
|