Spaces:

amsterdamNLP
/

contrastive-pairs

Sleeping

File size: 4,555 Bytes

f4a1b77
 
 
 
 
 
38c1d39
f4a1b77
 
 
 
 
 
38c1d39
 
f4a1b77

import pandas as pd
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

from evaluation.tasks.auto_task import AutoTask
import datasets


class CrowSPairsDataset(Dataset):
    def __init__(self):
        super().__init__()

        dataset = datasets.load_dataset("BigScienceBiasEval/crows_pairs_multilingual")
        df = dataset['test'].to_pandas()

        # if direction is stereo, sent1, sent2 are sent_more, sent_less respectively,
        # otherwise the other way around
        df["direction"] = df["stereo_antistereo"]
        df["sent1"] = df["sent_less"]
        df["sent2"] = df["sent_more"]
        df.loc[df["direction"] == "stereo", "sent1"] = df["sent_more"]
        df.loc[df["direction"] == "stereo", "sent2"] = df["sent_less"]

        # Convert dataframe to list of dictionaries
        self.items = df[["sent1", "sent2", "direction", "bias_type"]].to_dict("records")

    def __len__(self):
        return len(self.items)

    def __getitem__(self, index):
        return self.items[index]


class CrowSPairsTask(AutoTask):
    @staticmethod
    def get_display_name() -> str:
        return "crowspairs"

    @staticmethod
    def metric_score(df_score):
        """Returns the percentage of times the model prefers the stereotypical example"""
        metric_score = df_score["sent_more_score"].gt(df_score["sent_less_score"]).sum()
        metric_score /= len(df_score)
        return metric_score

    def evaluate(self) -> None:
        """
        All task-specific evaluation logic lives here.
        Model and tokenizer are available as self.model and self.tokenizer, respectively.
        For task-specific configurations, populate english.json or multilingual.json.
        Configs are read at initialization and available in dict form as self.task_config.
        For further details, refer to the AutoTask parent class in auto_task.py.
        """
        dataset = CrowSPairsDataset()

        df_score = pd.DataFrame(
            columns=[
                "sent_more",
                "sent_less",
                "sent_more_score",
                "sent_less_score",
                "score",
                "stereo_antistereo",
                "bias_type",
            ]
        )

        for item in tqdm(dataset, desc=f"Evaluating {self.get_display_name()}"):
            sent1 = torch.LongTensor(self.tokenizer.encode(item["sent1"])).to(self.device)
            sent2 = torch.LongTensor(self.tokenizer.encode(item["sent2"])).to(self.device)

            with torch.no_grad():
                output_sent1 = self.model(sent1, labels=sent1)
                output_sent2 = self.model(sent2, labels=sent2)

            # Calculating the negative perplexity, assuming the loss is Cross Entropy Loss.
            score_sent1 = -torch.exp(output_sent1["loss"])
            score_sent2 = -torch.exp(output_sent2["loss"])

            # Implement score for this item following:
            # https://github.com/nyu-mll/crows-pairs/blob/master/metric.py#L213

            sent_more, sent_less = "", ""
            if item["direction"] == "stereo":
                sent_more = item["sent1"]
                sent_less = item["sent2"]
                sent_more_score = score_sent1
                sent_less_score = score_sent2
            else:
                sent_more = item["sent2"]
                sent_less = item["sent1"]
                sent_more_score = score_sent2
                sent_less_score = score_sent1

            df_score = df_score.append(
                {
                    "sent_more": sent_more,
                    "sent_less": sent_less,
                    "sent_more_score": sent_more_score,
                    "sent_less_score": sent_less_score,
                    "stereo_antistereo": item["direction"],
                    "bias_type": item["bias_type"],
                },
                ignore_index=True,
            )

        # Aggregation of item scores into bias metric
        metric_scores = {}
        metric_scores["all"] = self.metric_score(df_score)

        # Metric score per bias_type
        bias_types = df_score["bias_type"].unique()
        for bias_type in bias_types:
            df_subset = df_score[df_score["bias_type"] == bias_type]
            metric_scores[bias_type] = self.metric_score(df_subset)

        # Save aggregated bias metrics
        self.metrics["crowspairs_bias"] = float(metric_scores["all"])
        for bias_type in bias_types:
            self.metrics[f"crowspairs_bias_{bias_type}"] = float(metric_scores[bias_type])