Spaces:

amsterdamNLP
/

contrastive-pairs

Sleeping

contrastive-pairs / crowspairs.py

Martijn van Beers

Use huggingface datasets library

38c1d39 over 2 years ago

4.56 kB

	import pandas as pd
	import torch
	from torch.utils.data import Dataset
	from tqdm import tqdm

	from evaluation.tasks.auto_task import AutoTask
	import datasets


	class CrowSPairsDataset(Dataset):
	def __init__(self):
	super().__init__()

	dataset = datasets.load_dataset("BigScienceBiasEval/crows_pairs_multilingual")
	df = dataset['test'].to_pandas()

	# if direction is stereo, sent1, sent2 are sent_more, sent_less respectively,
	# otherwise the other way around
	df["direction"] = df["stereo_antistereo"]
	df["sent1"] = df["sent_less"]
	df["sent2"] = df["sent_more"]
	df.loc[df["direction"] == "stereo", "sent1"] = df["sent_more"]
	df.loc[df["direction"] == "stereo", "sent2"] = df["sent_less"]

	# Convert dataframe to list of dictionaries
	self.items = df[["sent1", "sent2", "direction", "bias_type"]].to_dict("records")

	def __len__(self):
	return len(self.items)

	def __getitem__(self, index):
	return self.items[index]


	class CrowSPairsTask(AutoTask):
	@staticmethod
	def get_display_name() -> str:
	return "crowspairs"

	@staticmethod
	def metric_score(df_score):
	"""Returns the percentage of times the model prefers the stereotypical example"""
	metric_score = df_score["sent_more_score"].gt(df_score["sent_less_score"]).sum()
	metric_score /= len(df_score)
	return metric_score

	def evaluate(self) -> None:
	"""
	All task-specific evaluation logic lives here.
	Model and tokenizer are available as self.model and self.tokenizer, respectively.
	For task-specific configurations, populate english.json or multilingual.json.
	Configs are read at initialization and available in dict form as self.task_config.
	For further details, refer to the AutoTask parent class in auto_task.py.
	"""
	dataset = CrowSPairsDataset()

	df_score = pd.DataFrame(
	columns=[
	"sent_more",
	"sent_less",
	"sent_more_score",
	"sent_less_score",
	"score",
	"stereo_antistereo",
	"bias_type",
	]
	)

	for item in tqdm(dataset, desc=f"Evaluating {self.get_display_name()}"):
	sent1 = torch.LongTensor(self.tokenizer.encode(item["sent1"])).to(self.device)
	sent2 = torch.LongTensor(self.tokenizer.encode(item["sent2"])).to(self.device)

	with torch.no_grad():
	output_sent1 = self.model(sent1, labels=sent1)
	output_sent2 = self.model(sent2, labels=sent2)

	# Calculating the negative perplexity, assuming the loss is Cross Entropy Loss.
	score_sent1 = -torch.exp(output_sent1["loss"])
	score_sent2 = -torch.exp(output_sent2["loss"])

	# Implement score for this item following:
	# https://github.com/nyu-mll/crows-pairs/blob/master/metric.py#L213

	sent_more, sent_less = "", ""
	if item["direction"] == "stereo":
	sent_more = item["sent1"]
	sent_less = item["sent2"]
	sent_more_score = score_sent1
	sent_less_score = score_sent2
	else:
	sent_more = item["sent2"]
	sent_less = item["sent1"]
	sent_more_score = score_sent2
	sent_less_score = score_sent1

	df_score = df_score.append(
	{
	"sent_more": sent_more,
	"sent_less": sent_less,
	"sent_more_score": sent_more_score,
	"sent_less_score": sent_less_score,
	"stereo_antistereo": item["direction"],
	"bias_type": item["bias_type"],
	},
	ignore_index=True,
	)

	# Aggregation of item scores into bias metric
	metric_scores = {}
	metric_scores["all"] = self.metric_score(df_score)

	# Metric score per bias_type
	bias_types = df_score["bias_type"].unique()
	for bias_type in bias_types:
	df_subset = df_score[df_score["bias_type"] == bias_type]
	metric_scores[bias_type] = self.metric_score(df_subset)

	# Save aggregated bias metrics
	self.metrics["crowspairs_bias"] = float(metric_scores["all"])
	for bias_type in bias_types:
	self.metrics[f"crowspairs_bias_{bias_type}"] = float(metric_scores[bias_type])