aacengiz committed
Commit 08e0623 · 1 Parent(s): df15125

Add remaining datasets except POS and NER

src/deepeval/base_task.py CHANGED
@@ -77,52 +77,6 @@ class BaseTask(ABC):
 
         return answer
 
-     def generate_response_oeqa_multi_token(self, msg, max_new_tokens=-1, choices: list = []):
-         """
-         Handles multiple-choice questions where answers might have multiple tokens.
-         """
-         # Ensure tokenizer has proper special tokens set
-         if self.tokenizer.pad_token is None:
-             self.tokenizer.pad_token = self.tokenizer.eos_token
-
-         if self.model.config.pad_token_id is None:
-             self.model.config.pad_token_id = self.tokenizer.pad_token_id
-
-         chat = [
-             {"role": "user", "content": "You are a question-answering chatbot."},
-             {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
-             {"role": "user", "content": f"{msg}"},
-         ]
-         formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-         print(formatted_chat)
-         inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
-         input_ids = inputs.input_ids.to(self.model.device)
-         attention_mask = inputs.attention_mask.to(self.model.device)
-
-         # Custom LogitsProcessor to restrict generation
-         class RestrictToABCDLogitsProcessor(LogitsProcessor):
-             def __call__(self, input_ids, scores):
-                 mask = torch.full_like(scores, float("-inf"))  # Block all tokens
-                 return mask
-         logits_processor = LogitsProcessorList([RestrictToABCDLogitsProcessor()])
-
-         # Generate response
-         output = self.model.generate(
-             input_ids,
-             do_sample=True,
-             attention_mask=attention_mask,
-             #max_new_tokens=max_new_tokens,
-             eos_token_id=self.tokenizer.eos_token_id,
-             pad_token_id=self.tokenizer.pad_token_id,
-             temperature=0.4,
-             #logits_processor=logits_processor,
-         )
-         generated_ids = output[0]  # The generated sequence including the prompt
-         generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
-         generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
-         return generated_text
-
-
     def generate_response_mcqa_multi_token(self, msg, max_new_tokens=5, choices: list = []):
        """
        Handles multiple-choice questions where answers might have multiple tokens.
src/deepeval/bias.py ADDED
@@ -0,0 +1,98 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import ast
+
+
+ class BiasTask(BaseTask):
+     def __init__(self, model_name):
+         super().__init__("metunlp/sosyoloji_bias", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         dataset = super().load_dataset_from_hf()
+         return dataset.select(range(min(10, len(dataset))))
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+         total_count = 0
+         true = 0
+
+         for row in self.dataset:
+             total_count += 2
+
+             # Get values from row
+
+             ## common
+             choices = ast.literal_eval(row["choices"])  # Convert string to list
+             formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
+
+             ## ambiguous context
+             ambiguous_context = row["ambiguous_context"]
+             ambiguous_question = row["question_ambiguous"]
+             ambiguous_answer = row["answer_ambiguous"]
+             ambiguous_correct_answer_letter = chr(65 + ambiguous_answer)
+             ambiguous_prompt = f"Bağlam: {ambiguous_context}\nSoru: {ambiguous_question}\nSeçenekler:{formatted_choices}"
+
+             ## disambiguated context
+             disambiguated_context = row["disambiguated_context"]
+             disambiguated_question = row["question_disambiguated"]
+             disambiguated_answer = row["answer_disambiguated"]
+             disambiguated_correct_answer_letter = chr(65 + disambiguated_answer)
+             disambiguated_prompt = f"Bağlam: {disambiguated_context}\nSoru: {disambiguated_question}\nSeçenekler:{formatted_choices}"
+
+             # Prints for debugging
+             print(f"Choices: {choices}")
+             print("Type of choices:", type(choices))
+
+             # STAGE 1
+             instruction = ""
+             prompt = f"Stage1:\n{ambiguous_prompt}\n{instruction}\n"
+             message = prompt
+             ambiguous_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+             ambiguous_model_answer_cleaned = ambiguous_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+             ## Print answers
+             print(f"Correct Answer: {ambiguous_correct_answer_letter}")
+             print(f"Model Answer: {ambiguous_model_answer}")
+             print(f"Model Answer Cleaned: {ambiguous_model_answer_cleaned}")
+             print(f"Result: {ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned}")
+             ## Check if correct based on metric (tracked under the 'ambiguous' stage)
+             if ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned:
+                 true += 1
+                 difficulty_results['ambiguous']['correct'] += 1
+
+             difficulty_results['ambiguous']['total'] += 1
+
+             # STAGE 2
+             instruction = ""
+             prompt = f"Stage2:\n{disambiguated_prompt}\n{instruction}\n"
+             message = prompt
+             disambiguated_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+             disambiguated_model_answer_cleaned = disambiguated_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+             ## Print answers
+             print(f"Correct Answer: {disambiguated_correct_answer_letter}")
+             print(f"Model Answer: {disambiguated_model_answer}")
+             print(f"Model Answer Cleaned: {disambiguated_model_answer_cleaned}")
+             print(f"Result: {disambiguated_correct_answer_letter == disambiguated_model_answer_cleaned}")
+             responses.append((ambiguous_model_answer_cleaned, disambiguated_model_answer_cleaned))
+
+             ## Check if correct based on metric (tracked under the 'disambiguated' stage)
+             if disambiguated_correct_answer_letter == disambiguated_model_answer_cleaned:
+                 true += 1
+                 difficulty_results['disambiguated']['correct'] += 1
+
+             difficulty_results['disambiguated']['total'] += 1
+
+         # Print results categorized by stage
+         for category, stats in difficulty_results.items():
+             calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+             print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/deepeval_task_manager.py CHANGED
@@ -15,6 +15,12 @@ from src.deepeval.complex_reasoning import ComplexReasoningTask
 from src.deepeval.truthfulness_task import TruthfulnessTask
 from src.deepeval.nli import NLITask
 from src.deepeval.math import MathTask
+ from src.deepeval.turkish_vocabulary import TurkishVocabularyTask
+ from src.deepeval.metaphors_and_idioms import MetaphorsAndIdiomsTask
+ from src.deepeval.topic_detection import TopicDetectionTask
+ from src.deepeval.sts import STSTask
+ from src.deepeval.mmlu import MMLUTask
+ from src.deepeval.bias import BiasTask
 from typing import List
 load_dotenv()
 HF_TOKEN=os.getenv("HF_TOKEN")
@@ -35,6 +41,12 @@ class Task(Enum):
     TRUTHFULNESS = "sosyoloji_truthfulness"
     NLI = "nli"
     MATH = "math"
+     TURKISH_VOCABULARY = "turkish_vocabulary"
+     METAPHORS_AND_IDIOMS = "metaphors_and_idioms"
+     TOPIC_DETECTION = "topic_detection"
+     STS = "sts"
+     MMLU = "mmlu"
+     BIAS_MC = "bias"
 
 
 class DeepEvalTaskManager:
@@ -139,7 +151,37 @@ class DeepEvalTaskManager:
         res = math_task.evaluate()
         return res
 
+     def turkish_vocabulary(self):
+         turkish_vocabulary_task = TurkishVocabularyTask(self.model_name)
+         res = turkish_vocabulary_task.evaluate()
+         return res
+
+     def metaphors_and_idioms(self):
+         metaphors_and_idioms_task = MetaphorsAndIdiomsTask(self.model_name)
+         res = metaphors_and_idioms_task.evaluate()
+         return res
+
+     def topic_detection(self):
+         topic_detection_task = TopicDetectionTask(self.model_name)
+         res = topic_detection_task.evaluate()
+         return res
+
+     def sts(self):
+         sts_task = STSTask(self.model_name)
+         res = sts_task.evaluate()
+         return res
+
+     def mmlu(self):
+         mmlu_task = MMLUTask(self.model_name)
+         res = mmlu_task.evaluate()
+         return res
+
+     def bias(self):
+         bias_task = BiasTask(self.model_name)
+         res = bias_task.evaluate()
+         return res
+
 if __name__ == "__main__":
-     des = DeepEvalTaskManager("google/gemma-2-2b-it", ["SUMMARIZATION"])
+     des = DeepEvalTaskManager("google/gemma-2b-it", ["MMLU"])
     res = des.run_tasks()
     print(res)
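For reference, a minimal usage sketch of the updated manager with the newly registered tasks. The import path and the behaviour of `run_tasks()` are assumed from the diff above; the model name and task selection are only placeholders:

```python
# Hypothetical usage sketch; model name and task list are illustrative, not part of the commit.
from src.deepeval.deepeval_task_manager import DeepEvalTaskManager

if __name__ == "__main__":
    manager = DeepEvalTaskManager(
        "google/gemma-2b-it",  # any HF causal LM supported by BaseTask
        ["TURKISH_VOCABULARY", "METAPHORS_AND_IDIOMS", "TOPIC_DETECTION", "STS", "MMLU", "BIAS_MC"],
    )
    results = manager.run_tasks()  # presumably a mapping of task name -> {"acc": ..., "acc_stderr": ...}
    print(results)
```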
src/deepeval/math.py CHANGED
@@ -10,7 +10,47 @@ class MathTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-         return dataset.select(range(min(10, len(dataset))))
+         return dataset.select(range(min(1, len(dataset))))
+
+     def generate_response_oeqa_multi_token(self, msg, max_new_tokens: int = 128):
+         """
+         Handles open-ended questions where answers might have multiple tokens.
+         """
+         # Ensure tokenizer has proper special tokens set
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         if self.model.config.pad_token_id is None:
+             self.model.config.pad_token_id = self.tokenizer.pad_token_id
+
+         chat = [
+             {"role": "user", "content": "You are a question-answering chatbot."},
+             {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
+             {"role": "user", "content": f"{msg}"},
+         ]
+         formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+         print(formatted_chat)
+
+         inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
+         input_ids = inputs.input_ids.to(self.model.device)
+         attention_mask = inputs.attention_mask.to(self.model.device)
+
+         # Generate response with proper token limits
+         output = self.model.generate(
+             input_ids,
+             do_sample=True,
+             attention_mask=attention_mask,
+             eos_token_id=self.tokenizer.eos_token_id,
+             pad_token_id=self.tokenizer.pad_token_id,
+             temperature=0.4,
+             max_new_tokens=max_new_tokens,
+         )
+
+         generated_ids = output[0]  # The generated sequence including the prompt
+         generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
+         generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+         return generated_text
 
 
     def evaluate(self) -> dict[str, Any]:
@@ -31,23 +71,29 @@ class MathTask(BaseTask):
             print("Type of answer:", type(answer))
 
             # Construct the prompt/message
-             instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu içinde verin.
+             instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.
 
             Nihai Cevap için Uyulması Gereken Format Kuralları:
 
-             1. Nihai cevap, tek seferde \\boxed{{...}} içinde verilmeli. Örnek: Cevap 1 ise, "\\boxed{{1}}".
-             2. Kesirleri her zaman en sade halde verilmeli.
+             1. Kesirler her zaman en sade hallerinde verilmeli.
                - Matris içi kesirler: x/y biçiminde.
                - Diğer tüm kesirler: \\frac{{x}}{{y}} biçiminde.
-             3. Çarpma işareti (*) kullanmayın. Örnek: 2x yazın, 2*x değil.
-             4. Birden çok değişken varsa alfabetik sıraya uyulmalı ve (x, y, z...), polinomları azalan derece sırasına göre yazılmalı.
-             5. Her zaman aynı gösterim biçimi kullanılmalı. Ondalık yerine kesir kullanılmalı (ör. 0.5 yerine \\frac{{1}}{{2}} ).
-             6. Faktörize polinomlar daima aynı faktör sırası ile verilsin; her sorguda aynı cevabı verecek şekilde tutarlılığı koruyun.
-             7. Nihai cevabı kutu dışında tekrar etmeyin, biçimi değiştirmeyin. Aynı soru tekrarlandığında aynı formatı ve cevabı verin.
+             2. Çarpma işareti (*) kullanılmamalı. Örnek: 2x yazın, 2*x değil.
+             3. Birden çok değişken varsa alfabetik sıraya uyulmalı ve (x, y, z...), polinomları azalan derece sırasına göre yazılmalı.
+             4. Her zaman aynı gösterim biçimi kullanılmalı. Ondalık yerine kesir kullanılmalı (ör. 0.5 yerine \\frac{{1}}{{2}} ).
+             5. Faktörize polinomlar daima aynı faktör sırası ile verilsin; her sorguda aynı cevabı verecek şekilde tutarlılığı koruyun.
+             6. Nihai cevabı kutu dışında tekrar etmeyin, biçimi değiştirmeyin. Aynı soru tekrarlandığında aynı formatı ve cevabı verin.
+             7. Nihai cevap, tek seferde \\boxed{{...}} içinde verilmeli. Örnek: Cevap x ise, "\\boxed{{x}}".
+
 
             Görev: Problemi çözün, son adımda yukarıdaki kurallara tam uyan tek bir kutu içinde nihai cevabı verin.
 
-             """
+
+             Çözüm:
+
+
+             Nihai cevap:
+             """
             prompt = f"{instruction}\n\nSoru:\n{row["question"]}\n"
             message = prompt
 
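The new instruction asks the model to put its final answer inside \boxed{...}; a small helper along the following lines could pull that answer out of the generated text downstream. This is an illustrative sketch under that assumption, not part of the commit, and the function name is hypothetical:

```python
import re

def extract_boxed_answer(generated_text: str) -> str | None:
    """Hypothetical helper (not in this commit): return the content of the last \\boxed{...}."""
    # Nested braces (e.g. \frac{1}{2}) would need a more careful parser; this covers the simple case.
    matches = re.findall(r"\\boxed\{([^{}]*)\}", generated_text)
    return matches[-1].strip() if matches else None

# Example: extract_boxed_answer("Nihai cevap: \\boxed{42}") returns "42".
```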
src/deepeval/metaphors_and_idioms.py ADDED
@@ -0,0 +1,87 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import os
+ import ast
+ import re
+ from datasets import load_dataset, get_dataset_split_names
+
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ class MetaphorsAndIdiomsTask(BaseTask):
+     def __init__(self, model_name):
+         super().__init__("metunlp/metaphors_and_idioms", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         dataset = super().load_dataset_from_hf()
+         return dataset  # dataset.select(range(min(10, len(dataset))))
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: defaultdict(lambda: {'correct': 0, 'total': 0}))
+
+         total_count = 0
+         true = 0
+
+         for row in self.dataset:
+             total_count += 1
+
+             # Get values from row
+             category = "hard" if row["level"] == 1 else "easy" if row["level"] == 0 else None
+             answer_index = row["answer"]
+             correct_answer_letter = chr(65 + answer_index)
+             context = row["context"]
+             choices = ast.literal_eval(row["choices"])  # Convert string to list
+             formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
+             subset = row["idiom_type"]
+
+             if subset == "atasözü":
+                 question = "Aşağıda verilen durum hangi atasözü ile en iyi ifade edilebilir?"
+             elif subset == "deyim":
+                 question = """Verilen bağlamda "[MASKED]" ile boş bırakılan yere hangi deyim getirilirse cümlenin akışı anlamlı olur?"""
+             else:
+                 question = "Aşağıda verilen durum hangi atasözü ile en iyi ifade edilebilir?"
+
+             # Prints for debugging
+             print(f"Difficulty: {category}")
+             print("Type of difficulty:", type(category))
+             print(f"Answer: {correct_answer_letter}")
+             print("Type of answer:", type(answer_index))
+
+             # Construct the prompt/message
+             instruction = ""
+             prompt = f"Soru: {question}\nBağlam: {context}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+             message = prompt
+
+             # Get/format answer of the model
+             model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+             responses.append(model_answer)
+             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+
+             # Print answers
+             print(f"Correct Answer: {correct_answer_letter}")
+             print(f"Model Answer: {model_answer}")
+             print(f"Model Answer Cleaned: {model_answer_cleaned}")
+             print(f"Result: {correct_answer_letter == model_answer_cleaned}")
+
+             # Check if correct based on metric
+             if correct_answer_letter == model_answer_cleaned:
+                 true += 1
+                 difficulty_results[subset][category]['correct'] += 1
+
+             difficulty_results[subset][category]['total'] += 1
+
+         # Print results categorized by difficulty
+         for subset in difficulty_results.keys():
+             subset_results = difficulty_results[subset]
+             for category, stats in subset_results.items():
+                 calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+                 print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/mmlu.py ADDED
@@ -0,0 +1,87 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import os
+ import ast
+ import re
+ from datasets import load_dataset, get_dataset_config_names
+
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ class MMLUTask(BaseTask):
+     def __init__(self, model_name):
+         self.subsets = get_dataset_config_names("metunlp/mmlu_tr")
+         print(self.subsets)
+         super().__init__("metunlp/mmlu_tr", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         evaluate_count = 1
+         print("Loading dataset from Hugging Face.")
+         dataset_dict = {}
+         for subset in self.subsets:
+             subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
+             dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
+         print("Dataset loaded.")
+         return dataset_dict
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+
+         total_count = 0
+         true = 0
+
+         for subset in self.subsets:
+             curr_dataset = self.dataset[subset]
+             print(curr_dataset[0])
+
+             for row in curr_dataset:
+                 total_count += 1
+
+                 # Get values from row
+                 question = row["question"]
+                 answer_index = row["answer"]
+                 correct_answer_letter = chr(65 + answer_index)
+                 choices = ast.literal_eval(row["choices"])  # Convert string to list
+                 formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
+
+                 # Prints for debugging
+                 print(f"Answer: {correct_answer_letter}")
+                 print("Type of answer:", type(answer_index))
+
+                 # Construct the prompt/message
+                 instruction = f"Aşağıda {row["subject"]} konusunda çoktan seçmeli bir soru verilmiştir."
+                 prompt = f"{instruction}\n\nSoru: {question}\nSeçenekler:\n{formatted_choices}\n\n"
+                 message = prompt
+
+                 # Get/format answer of the model
+                 model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+                 responses.append(model_answer)
+                 model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+
+                 # Print answers
+                 print(f"Correct Answer: {correct_answer_letter}")
+                 print(f"Model Answer: {model_answer}")
+                 print(f"Model Answer Cleaned: {model_answer_cleaned}")
+                 print(f"Result: {correct_answer_letter == model_answer_cleaned}")
+
+                 # Check if correct based on metric
+                 if correct_answer_letter == model_answer_cleaned:
+                     true += 1
+                     difficulty_results[subset]['correct'] += 1
+
+                 difficulty_results[subset]['total'] += 1
+
+         # Print results categorized by subset
+         for category, stats in difficulty_results.items():
+             calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+             print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/sts.py ADDED
@@ -0,0 +1,131 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import re
+ from datasets import load_dataset
+ import os
+ from dotenv import load_dotenv
+ import openai
+ from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList, LogitsProcessor
+ import torch
+ from typing import List
+
+ class STSTask(BaseTask):
+     def __init__(self, model_name):
+         super().__init__("metunlp/sts_tr", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         dataset = super().load_dataset_from_hf()
+         return dataset.select(range(min(1, len(dataset))))
+
+     def generate_response_sts_multi_token(self, msg, max_new_tokens=5, choices: list = []):
+         """
+         Handles similarity-scoring questions where answers might have multiple tokens.
+         """
+         # Ensure tokenizer has proper special tokens set
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         if self.model.config.pad_token_id is None:
+             self.model.config.pad_token_id = self.tokenizer.pad_token_id
+
+         chat = [
+             {"role": "user",
+              "content": "You are a sentence similarity scoring chatbot. Only respond with one of the given scores: 0, 1, 2, 3, 4, or 5."},
+             {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
+             {"role": "user", "content": f"{msg}"},
+         ]
+         formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+         print(formatted_chat)
+         inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
+         input_ids = inputs.input_ids.to(self.model.device)
+         attention_mask = inputs.attention_mask.to(self.model.device)
+
+         # Encode the allowed score labels ("0" through "5")
+         letters = ["0", "1", "2", "3", "4", "5"]
+         encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
+         flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist]  # Flatten the list
+         print(flattened_encoded_choices)
+
+         allowed_tokens = flattened_encoded_choices
+         allowed_tokens += self.get_chat_template_tokens()  # Get the special chat tokens
+         allowed_token_ids = set(allowed_tokens)  # Ensure uniqueness
+
+         # Custom LogitsProcessor to restrict generation
+         class RestrictToABCDLogitsProcessor(LogitsProcessor):
+             def __call__(self, input_ids, scores):
+                 mask = torch.full_like(scores, float("-inf"))  # Block all tokens
+                 mask[:, list(allowed_token_ids)] = scores[:, list(allowed_token_ids)]  # Allow only the score tokens (0-5)
+                 return mask
+
+         logits_processor = LogitsProcessorList([RestrictToABCDLogitsProcessor()])
+
+         # Generate response
+         output = self.model.generate(
+             input_ids,
+             do_sample=True,
+             attention_mask=attention_mask,
+             max_new_tokens=max_new_tokens,
+             eos_token_id=self.tokenizer.eos_token_id,
+             pad_token_id=self.tokenizer.pad_token_id,
+             temperature=0.4,
+             logits_processor=logits_processor,
+         )
+         generated_ids = output[0]  # The generated sequence including the prompt
+         generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
+         generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+         return generated_text
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = {'correct': 0, 'total': 0}
+
+         total_count = 0
+         true = 0
+
+         for row in self.dataset:
+             total_count += 1
+
+             # Get values from row
+             answer = row["score"]
+             choices = ["0", "1", "2", "3", "4", "5"]
+
+             # Prints for debugging
+             print(f"Answer: {answer}")
+             print("Type of answer:", type(answer))
+
+             # Construct the prompt/message
+             instruction = f"Aşağıda verilen iki cümlenin birbirlerine olan anlamsal benzerliğini 0'dan 5'e kadar olan bir tam sayıyla söyleyin."
+             prompt = f"""{instruction}\nCümle 1: {row["sentence_1"]}\nCümle 2: {row["sentence_2"]}\nSadece tek bir tam sayı söyleyin, ek bir kelime ya da sembol kullanmayın."""
+             message = prompt
+
+             # Get/format answer of the model
+             model_answer = self.generate_response_sts_multi_token(message, max_new_tokens=2)
+             responses.append(model_answer)
+             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+
+             # Print answers
+             print(f"Correct Answer: {answer}")
+             print(f"Model Answer: {model_answer}")
+             print(f"Model Answer Cleaned: {model_answer_cleaned}")
+             print(f"Result: {answer == model_answer_cleaned}")
+
+             # Check if correct based on metric
+             if answer == model_answer_cleaned:
+                 true += 1
+                 difficulty_results['correct'] += 1
+
+             difficulty_results['total'] += 1
+
+         # Print results
+         stats = difficulty_results
+         calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+         print(f"Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/topic_detection.py ADDED
@@ -0,0 +1,79 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import ast
+
+
+ class TopicDetectionTask(BaseTask):
+     def __init__(self, model_name):
+         super().__init__("metunlp/topic_detection_tr", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         dataset = super().load_dataset_from_hf()
+         return dataset.select(range(min(10, len(dataset))))
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+         total_count = 0
+         true = 0
+
+         for row in self.dataset:
+             total_count += 1
+
+             # Get values from row
+             choices = ast.literal_eval(row["choices"])  # Convert string to list
+             formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
+             category = row["level"].lower().replace(' ', '')
+             answer = row["answer"]
+             text = row["text"]
+
+             # Prints for debugging
+             print(f"Choices: {choices}")
+             print("Type of choices:", type(choices))
+             print("Type of answer:", type(answer))
+
+             # Get answer index (starting from 0)
+             if type(answer) == int:
+                 answer_index = answer
+             else:
+                 answer_index = int(answer)
+             correct_answer_letter = chr(65 + answer_index)
+
+             # Construct the prompt/message
+             instruction = "Aşağıdaki metni analiz et ve seçeneklerden bu metnin en olası kategorisini belirle. Temaya ve detaylara dikkat ederek metnin ana fikrini göz önünde bulundurarak soruyu cevapla."
+             prompt = f"{instruction}\n\nMetin:\n{text}\nSeçenekler:\n{formatted_choices}\n\n"
+             message = prompt
+
+             # Get/format answer of the model
+             model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+             responses.append(model_answer)
+             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+
+             # Print answers
+             print(f"Correct Answer: {correct_answer_letter}")
+             print(f"Model Answer: {model_answer}")
+             print(f"Model Answer Cleaned: {model_answer_cleaned}")
+             print(f"Result: {correct_answer_letter == model_answer_cleaned}")
+
+             # Check if correct based on metric
+             if correct_answer_letter == model_answer_cleaned:
+                 true += 1
+                 difficulty_results[category]['correct'] += 1
+
+             difficulty_results[category]['total'] += 1
+
+         # Print results categorized by difficulty
+         for category, stats in difficulty_results.items():
+             calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+             print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/turkish_vocabulary.py ADDED
@@ -0,0 +1,100 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import os
+ import ast
+ import re
+ from datasets import load_dataset, get_dataset_split_names
+
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ class TurkishVocabularyTask(BaseTask):
+     def __init__(self, model_name):
+         self.subsets = ["rare", "loan"]
+         super().__init__("metunlp/turkish_vocabulary", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         evaluate_count = 1
+         print("Loading dataset from Hugging Face.")
+         dataset_dict = {}
+         for subset in self.subsets:
+             subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
+             dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
+         print("Dataset loaded.")
+         return dataset_dict
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: defaultdict(lambda: {'correct': 0, 'total': 0}))
+
+         total_count = 0
+         true = 0
+
+         for subset in self.subsets:
+             curr_dataset = self.dataset[subset]
+             print(curr_dataset[0])
+
+             # Determine the question based on the subset
+             if subset == "rare":
+                 question = "Verilen kelimenin eş anlamlısı aşağıdakilerden hangisidir?"
+             elif subset == "loan":
+                 question = "Verilen kelimenin Türkçe kökenli eş anlamlısı aşağıdakilerden hangisidir?"
+             else:
+                 question = "Verilen kelimenin eş anlamlısı aşağıdakilerden hangisidir?"
+
+             for row in curr_dataset:
+                 total_count += 1
+
+                 # Get values from row
+                 category = "hard" if row["level"] == 1 else "easy" if row["level"] == 0 else None
+                 answer_index = row["answer"]
+                 correct_answer_letter = chr(65 + answer_index)
+                 word = row["word"]
+                 choices = ast.literal_eval(row["choices"])  # Convert string to list
+                 formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
+
+                 # Prints for debugging
+                 print(f"Difficulty: {category}")
+                 print("Type of difficulty:", type(category))
+                 print(f"Answer: {correct_answer_letter}")
+                 print("Type of answer:", type(answer_index))
+
+                 # Construct the prompt/message
+                 instruction = ""
+                 prompt = f"Soru: {question}\nKelime: {word}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+                 message = prompt
+
+                 # Get/format answer of the model
+                 model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+                 responses.append(model_answer)
+                 model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+
+                 # Print answers
+                 print(f"Correct Answer: {correct_answer_letter}")
+                 print(f"Model Answer: {model_answer}")
+                 print(f"Model Answer Cleaned: {model_answer_cleaned}")
+                 print(f"Result: {correct_answer_letter == model_answer_cleaned}")
+
+                 # Check if correct based on metric
+                 if correct_answer_letter == model_answer_cleaned:
+                     true += 1
+                     difficulty_results[subset][category]['correct'] += 1
+
+                 difficulty_results[subset][category]['total'] += 1
+
+         # Print results categorized by difficulty
+         for subset in self.subsets:
+             subset_results = difficulty_results[subset]
+             for category, stats in subset_results.items():
+                 calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+                 print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}