OG3850 committed on
Commit 82a69da
1 Parent(s): ec9c39a
Files changed (2)
  1. src/deepeval/bias.py +0 -106
  2. src/deepeval/bias_task.py +2 -2
src/deepeval/bias.py DELETED
@@ -1,106 +0,0 @@
-from src.deepeval.base_task import BaseTask
-from collections import defaultdict
-from src.deepeval.utils import accuracy, accuracy_standard_error
-from typing import Any
-import ast
-
-
-class BiasTask(BaseTask):
-    def __init__(self, model_name):
-        super().__init__("metunlp/sosyoloji_bias", model_name=model_name)
-
-<<<<<<< HEAD
-    def load_dataset_from_hf(self):
-        dataset = super().load_dataset_from_hf()
-        return dataset
-=======
-    def load_dataset_lmjudge_from_hf(self):
-        dataset = super().load_dataset_lmjudge_from_hf()
-        return dataset.select(range(min(1, len(dataset))))
->>>>>>> 2dca79c (customize the data count)
-
-
-    def evaluate(self) -> dict[str, Any]:
-        responses = []
-        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
-        total_count = 0
-        true = 0
-
-        for row in self.dataset:
-            total_count += 2
-
-            # Get values from row
-
-            ## common
-            choices = ast.literal_eval(row["choices"])  # Convert string to list
-            formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
-
-            ## ambiguous context
-            ambiguous_context = row["ambiguous_context"]
-            ambiguous_question = row["question_ambiguous"]
-            ambiguous_answer = row["answer_ambiguous"]
-            ambiguous_correct_answer_letter = chr(64 + ambiguous_answer)  # 65 - 1 since we need the indexing to start from 0
-            ambiguous_prompt = f"Bağlam: {ambiguous_context}\nSoru: {ambiguous_question}\nSeçenekler:{formatted_choices}"
-
-            ## disambiguated context
-            disambiguated_context = row["disambiguated_context"]
-            disambiguated_question = row["question_disambiguated"]
-            disambiguated_answer = row["answer_disambiguated"]
-            disambiguated_correct_answer_letter = chr(64 + disambiguated_answer)  # 65 - 1 since we need the indexing to start from 0
-            disambiguated_prompt = f"Bağlam: {disambiguated_context}\nSoru: {disambiguated_question}\nSeçenekler:{formatted_choices}"
-
-            # Prints for debugging
-            print(f"Choices: {choices}")
-            print("Type of choices:", type(choices))
-
-            # STAGE 1
-            instruction = ""
-            prompt = f"Stage1:\n{ambiguous_prompt}\n{instruction}\n"
-            message = prompt
-            ambiguous_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
-            ambiguous_model_answer_cleaned = ambiguous_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
-            ## Print answers
-            print(f"Correct Answer: {ambiguous_correct_answer_letter}")
-            print(f"Model Answer: {ambiguous_model_answer}")
-            print(f"Model Answer Cleaned: {ambiguous_model_answer_cleaned}")
-            print(f"Result: {ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned}")
-            ## Check if correct based on metric
-            if ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned:
-                true += 1
-                difficulty_results["ambiguous"]['correct'] += 1
-
-            difficulty_results["ambiguous"]['total'] += 1
-
-            # STAGE 2
-            instruction = ""
-            prompt = f"Stage2:\n{disambiguated_prompt}\n{instruction}\n"
-            message = prompt
-            disambiguated_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
-            disambiguated_model_answer_cleaned = disambiguated_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
-            ## Print answers
-            print(f"Correct Answer: {disambiguated_correct_answer_letter}")
-            print(f"Model Answer: {disambiguated_model_answer}")
-            print(f"Model Answer Cleaned: {disambiguated_model_answer_cleaned}")
-            print(f"Result: {disambiguated_correct_answer_letter == disambiguated_model_answer_cleaned}")
-            responses.append((ambiguous_model_answer_cleaned, disambiguated_model_answer_cleaned))
-
-            ## Check if correct based on metric
-            if disambiguated_correct_answer_letter == disambiguated_model_answer_cleaned:
-                true += 1
-                difficulty_results["disambiguated"]['correct'] += 1
-
-            difficulty_results["disambiguated"]['total'] += 1
-
-        # Print results categorized by difficulty
-        for category, stats in difficulty_results.items():
-            correct = stats['correct']
-            total = stats['total']
-            calculatedAccuracy = correct / total if total > 0 else 0
-            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({correct}/{total})")
-
-        print("Results:", responses)
-        print("Overall Accuracy:", true / total_count)
-        acc = accuracy(true, total_count)
-        acc_stderr = accuracy_standard_error(acc, total_count)
-        return {"acc": acc, "acc_stderr": acc_stderr}
-
src/deepeval/bias_task.py CHANGED
@@ -9,8 +9,8 @@ class BiasTask(BaseTask):
     def __init__(self, model_name: str):
         super().__init__("metunlp/sosyoloji_bias", model_name=model_name)
 
-    def load_dataset_from_hf(self):
-        dataset = super().load_dataset_from_hf()
+    def load_dataset_lmjudge_from_hf(self):
+        dataset = super().load_dataset_lmjudge_from_hf()
         return dataset
 
     def evaluate(self) -> dict[str, Any]:
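
For context, a minimal usage sketch of the class touched by this commit. It is a hypothetical driver script, not part of the repository; it assumes only what the diffs above show (BiasTask in src/deepeval/bias_task.py, the renamed load_dataset_lmjudge_from_hf loader, and evaluate() returning dict[str, Any]). The model name is illustrative, and how the loaded split is wired into evaluate() is handled by BaseTask, which this diff does not show.

# Hypothetical usage sketch; names marked below are assumptions, not from this commit.
from src.deepeval.bias_task import BiasTask

task = BiasTask(model_name="example-org/example-model")  # illustrative model name
dataset = task.load_dataset_lmjudge_from_hf()            # loader renamed in this commit
results = task.evaluate()                                # typed as dict[str, Any] in the diff above
print(results)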