Spaces:

metunlp
/

model-eval-be

Sleeping

App Files Files Community

aacengiz committed 18 days ago

Commit

847b372

1 Parent(s): 48b440e

add commonsense reasoning

Browse files

Files changed (1) hide show

src/deepeval/commonsense_reasoning_task.py +46 -0

src/deepeval/commonsense_reasoning_task.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from src.deepeval.base_task import BaseTask
+from src.deepeval.utils import accuracy, accuracy_standard_error
+from typing import Any
class SentimentAnalysisTask(BaseTask):
    """Score a model on the two-choice cause/effect task in ``metunlp/commonsense``.

    For every evaluated row the model is shown a premise, a Turkish question
    selected by the row's ``label`` ("cause" or "effect") and the two
    candidates ``choice1`` / ``choice2`` rendered as options ``A`` and ``B``.
    The letter returned by the model is compared with the gold letter and the
    run is summarised as accuracy plus its standard error.

    NOTE(review): the class name is a leftover from the sentiment task this
    file appears to have been copied from. It is kept unchanged so existing
    imports keep working; consider renaming it in a coordinated change.
    """

    # Upper bound on the number of rows evaluated per run.
    MAX_SAMPLES = 10

    # Row key that holds the gold answer.
    # NOTE(review): this is an assumption. The previous code read a
    # "sentiment" column with a positive/negative/neutral mapping, which
    # cannot describe the answer of a two-choice cause/effect row.
    # TODO: confirm the column name against the dataset schema.
    ANSWER_FIELD = "answer"

    # Question text keyed by the row's ``label``.
    QUESTIONS = {
        # "Which of the options could be a result or effect of the premise?"
        "effect": "Seçeneklerden hangisi verilen önermenin bir sonucu veya etkisi olabilir?",
        # "Which of the options could be a cause or reason of the premise?"
        "cause": "Seçeneklerden hangisi verilen önermenin bir neden veya sebebi olabilir?",
    }
    # Fallback for any other label: "Which of the options is appropriate?"
    DEFAULT_QUESTION = "Seçeneklerden hangisi uygun?"

    def __init__(self, model_name):
        """Bind the task to the ``metunlp/commonsense`` dataset and a model name."""
        super().__init__("metunlp/commonsense", model_name=model_name)

    def load_dataset_from_hf(self):
        """Load the dataset through the base class and keep the first rows only.

        Returns:
            The first ``MAX_SAMPLES`` rows of the loaded dataset, or all of
            them when the dataset is shorter.
        """
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(self.MAX_SAMPLES, len(dataset))))

    @staticmethod
    def _premise(row):
        """Return the premise text of *row*.

        The original code read ``row["sentence"]`` but formatted the prompt
        from a ``"text"`` key of an undefined variable, so the intended
        column is ambiguous. Prefer ``"text"`` and fall back to
        ``"sentence"``. TODO: confirm which column the dataset provides.
        """
        return row["text"] if "text" in row else row["sentence"]

    @classmethod
    def _build_prompt(cls, premise, label, choices):
        """Render the premise, the label-specific question and lettered options."""
        question = cls.QUESTIONS.get(label, cls.DEFAULT_QUESTION)
        formatted_choices = "\n".join(
            f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)
        )
        return f"Premise:\n{premise}\nSoru:{question}\nSeçenekler:\n{formatted_choices}"

    def _gold_letter(self, row, choices):
        """Map the gold answer of *row* to an option letter.

        Accepted encodings of ``row[ANSWER_FIELD]``: an option letter, the
        literal text of one of *choices*, or a 1-based position given as an
        int or a digit string (assumed 1-based; TODO confirm).

        Returns:
            The gold letter, or ``None`` when the value is not recognised, in
            which case the row is counted as incorrect.

        Raises:
            KeyError: if the row has no ``ANSWER_FIELD`` column, so a wrong
                schema assumption fails loudly instead of yielding 0 accuracy.
        """
        gold = row[self.ANSWER_FIELD]
        letters = [chr(65 + i) for i in range(len(choices))]

        if isinstance(gold, str):
            text = gold.strip()
            if text.upper() in letters:
                return text.upper()
            if text in choices:
                return letters[choices.index(text)]
            if not text.isdigit():
                return None
            gold = int(text)

        # bool is a subclass of int; do not treat True/False as positions.
        if isinstance(gold, int) and not isinstance(gold, bool):
            if 1 <= gold <= len(letters):
                return letters[gold - 1]
        return None

    def evaluate(self) -> dict[str, Any]:
        """Run the model over ``self.dataset`` and compute accuracy.

        Returns:
            A dict with ``"acc"`` (result of ``accuracy``) and
            ``"acc_stderr"`` (result of ``accuracy_standard_error``).
        """
        total_count = len(self.dataset)
        n_correct = 0

        for row in self.dataset:
            choices = [row["choice1"], row["choice2"]]
            prompt = self._build_prompt(self._premise(row), row["label"], choices)

            answer = self.generate_response_mcqa_multi_token(prompt, choices=choices)
            print("Answer:", answer)

            # Drop all whitespace so " a\n" and "A" compare equal.
            model_letter = "".join(answer.split()).upper()
            if model_letter == self._gold_letter(row, choices):
                n_correct += 1

        acc = accuracy(n_correct, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}