Add remaining datasets except pos and ner
- src/deepeval/base_task.py +0 -46
- src/deepeval/bias.py +98 -0
- src/deepeval/deepeval_task_manager.py +43 -1
- src/deepeval/math.py +56 -10
- src/deepeval/metaphors_and_idioms.py +87 -0
- src/deepeval/mmlu.py +87 -0
- src/deepeval/sts.py +131 -0
- src/deepeval/topic_detection.py +79 -0
- src/deepeval/turkish_vocabulary.py +100 -0
src/deepeval/base_task.py
CHANGED
@@ -77,52 +77,6 @@ class BaseTask(ABC):
 
         return answer
 
-    def generate_response_oeqa_multi_token(self, msg, max_new_tokens=-1, choices: list = []):
-        """
-        Handles multiple-choice questions where answers might have multiple tokens.
-        """
-        # Ensure tokenizer has proper special tokens set
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-
-        if self.model.config.pad_token_id is None:
-            self.model.config.pad_token_id = self.tokenizer.pad_token_id
-
-        chat = [
-            {"role": "user", "content": "You are a question-answering chatbot."},
-            {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
-            {"role": "user", "content": f"{msg}"},
-        ]
-        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-        print(formatted_chat)
-        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
-        input_ids = inputs.input_ids.to(self.model.device)
-        attention_mask = inputs.attention_mask.to(self.model.device)
-
-        # Custom LogitsProcessor to restrict generation
-        class RestrictToABCDLogitsProcessor(LogitsProcessor):
-            def __call__(self, input_ids, scores):
-                mask = torch.full_like(scores, float("-inf"))  # Block all tokens
-                return mask
-        logits_processor = LogitsProcessorList([RestrictToABCDLogitsProcessor()])
-
-        # Generate response
-        output = self.model.generate(
-            input_ids,
-            do_sample=True,
-            attention_mask=attention_mask,
-            #max_new_tokens=max_new_tokens,
-            eos_token_id=self.tokenizer.eos_token_id,
-            pad_token_id=self.tokenizer.pad_token_id,
-            temperature=0.4,
-            #logits_processor=logits_processor,
-        )
-        generated_ids = output[0]  # The generated sequence including the prompt
-        generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
-        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
-        return generated_text
-
-
     def generate_response_mcqa_multi_token(self, msg, max_new_tokens=5, choices: list = []):
         """
         Handles multiple-choice questions where answers might have multiple tokens.
src/deepeval/bias.py
ADDED
@@ -0,0 +1,98 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import ast


class BiasTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/sosyoloji_bias", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(10, len(dataset))))

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 2

            # Get values from row

            ## common
            choices = ast.literal_eval(row["choices"])  # Convert string to list
            formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])

            ## ambiguous context
            ambiguous_context = row["ambiguous_context"]
            ambiguous_question = row["question_ambiguous"]
            ambiguous_answer = row["answer_ambiguous"]
            ambiguous_correct_answer_letter = chr(65 + ambiguous_answer)
            ambiguous_prompt = f"Bağlam: {ambiguous_context}\nSoru: {ambiguous_question}\nSeçenekler:{formatted_choices}"

            ## disambiguated context
            disambiguated_context = row["disambiguated_context"]
            disambiguated_question = row["question_disambiguated"]
            disambiguated_answer = row["answer_disambiguated"]
            disambiguated_correct_answer_letter = chr(65 + disambiguated_answer)
            disambiguated_prompt = f"Bağlam: {disambiguated_context}\nSoru: {disambiguated_question}\nSeçenekler:{formatted_choices}"

            # Prints for debugging
            print(f"Choices: {choices}")
            print("Type of choices:", type(choices))

            # STAGE 1
            instruction = ""
            prompt = f"Stage1:\n{ambiguous_prompt}\n{instruction}\n"
            message = prompt
            ambiguous_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
            ambiguous_model_answer_cleaned = ambiguous_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
            ## Print answers
            print(f"Correct Answer: {ambiguous_correct_answer_letter}")
            print(f"Model Answer: {ambiguous_model_answer}")
            print(f"Model Answer Cleaned: {ambiguous_model_answer_cleaned}")
            print(f"Result: {ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned}")
            ## Check if correct based on metric
            if ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned:
                true += 1
                difficulty_results['ambiguous']['correct'] += 1

            difficulty_results['ambiguous']['total'] += 1

            # STAGE 2
            instruction = ""
            prompt = f"Stage2:\n{disambiguated_prompt}\n{instruction}\n"
            message = prompt
            disambiguated_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
            disambiguated_model_answer_cleaned = disambiguated_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
            ## Print answers
            print(f"Correct Answer: {disambiguated_correct_answer_letter}")
            print(f"Model Answer: {disambiguated_model_answer}")
            print(f"Model Answer Cleaned: {disambiguated_model_answer_cleaned}")
            print(f"Result: {disambiguated_correct_answer_letter == disambiguated_model_answer_cleaned}")
            responses.append((ambiguous_model_answer_cleaned, disambiguated_model_answer_cleaned))

            ## Check if correct based on metric
            if disambiguated_correct_answer_letter == disambiguated_model_answer_cleaned:
                true += 1
                difficulty_results['disambiguated']['correct'] += 1

            difficulty_results['disambiguated']['total'] += 1

        # Print results categorized by stage
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
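Every task in this commit normalizes the model's raw reply the same way before comparing it with the gold letter. A minimal standalone sketch of that cleanup step (the helper name is hypothetical, not part of this commit):

    def clean_mcqa_answer(raw: str) -> str:
        # Same chain used inline above: strip surrounding whitespace,
        # drop newlines, spaces and colons, and uppercase the rest.
        return raw.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')

    assert clean_mcqa_answer(" b:\n") == "B"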
src/deepeval/deepeval_task_manager.py
CHANGED
@@ -15,6 +15,12 @@ from src.deepeval.complex_reasoning import ComplexReasoningTask
 from src.deepeval.truthfulness_task import TruthfulnessTask
 from src.deepeval.nli import NLITask
 from src.deepeval.math import MathTask
+from src.deepeval.turkish_vocabulary import TurkishVocabularyTask
+from src.deepeval.metaphors_and_idioms import MetaphorsAndIdiomsTask
+from src.deepeval.topic_detection import TopicDetectionTask
+from src.deepeval.sts import STSTask
+from src.deepeval.mmlu import MMLUTask
+from src.deepeval.bias import BiasTask
 from typing import List
 load_dotenv()
 HF_TOKEN=os.getenv("HF_TOKEN")
@@ -35,6 +41,12 @@ class Task(Enum):
     TRUTHFULNESS = "sosyoloji_truthfulness"
     NLI = "nli"
     MATH = "math"
+    TURKISH_VOCABULARY = "turkish_vocabulary"
+    METAPHORS_AND_IDIOMS = "metaphors_and_idioms"
+    TOPIC_DETECTION = "topic_detection"
+    STS = "sts"
+    MMLU = "mmlu"
+    BIAS_MC = "bias"
 
 
 class DeepEvalTaskManager:
@@ -139,7 +151,37 @@ class DeepEvalTaskManager:
         res = math_task.evaluate()
         return res
 
+    def turkish_vocabulary(self):
+        turkish_vocabulary_task = TurkishVocabularyTask(self.model_name)
+        res = turkish_vocabulary_task.evaluate()
+        return res
+
+    def metaphors_and_idioms(self):
+        metaphors_and_idioms_task = MetaphorsAndIdiomsTask(self.model_name)
+        res = metaphors_and_idioms_task.evaluate()
+        return res
+
+    def topic_detection(self):
+        topic_detection_task = TopicDetectionTask(self.model_name)
+        res = topic_detection_task.evaluate()
+        return res
+
+    def sts(self):
+        sts_task = STSTask(self.model_name)
+        res = sts_task.evaluate()
+        return res
+
+    def mmlu(self):
+        mmlu_task = MMLUTask(self.model_name)
+        res = mmlu_task.evaluate()
+        return res
+
+    def bias(self):
+        bias_task = BiasTask(self.model_name)
+        res = bias_task.evaluate()
+        return res
+
 if __name__ == "__main__":
-    des = DeepEvalTaskManager("google/gemma-
+    des = DeepEvalTaskManager("google/gemma-2b-it", ["MMLU"])
     res = des.run_tasks()
     print(res)
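With the imports, Task enum members, and handler methods added above, the manager can be pointed at any mix of the new tasks by enum name, as the updated __main__ block does. A minimal usage sketch (model name and task selection are illustrative only):

    from src.deepeval.deepeval_task_manager import DeepEvalTaskManager

    # Assumes HF_TOKEN is available via .env, as in the module above.
    manager = DeepEvalTaskManager("google/gemma-2b-it", ["TURKISH_VOCABULARY", "STS", "BIAS_MC"])
    results = manager.run_tasks()  # each task's evaluate() returns {"acc": ..., "acc_stderr": ...}
    print(results)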
src/deepeval/math.py
CHANGED
@@ -10,7 +10,47 @@ class MathTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(
+        return dataset.select(range(min(1, len(dataset))))
+
+    def generate_response_oeqa_multi_token(self, msg, max_new_tokens: int = 128):
+        """
+        Handles multiple-choice questions where answers might have multiple tokens.
+        """
+        # Ensure tokenizer has proper special tokens set
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        if self.model.config.pad_token_id is None:
+            self.model.config.pad_token_id = self.tokenizer.pad_token_id
+
+        chat = [
+            {"role": "user", "content": "You are a question-answering chatbot."},
+            {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
+            {"role": "user", "content": f"{msg}"},
+        ]
+        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+        print(formatted_chat)
+
+        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
+        input_ids = inputs.input_ids.to(self.model.device)
+        attention_mask = inputs.attention_mask.to(self.model.device)
+
+        # Generate response with proper token limits
+        output = self.model.generate(
+            input_ids,
+            do_sample=True,
+            attention_mask=attention_mask,
+            eos_token_id=self.tokenizer.eos_token_id,
+            pad_token_id=self.tokenizer.pad_token_id,
+            temperature=0.4,
+            max_new_tokens=max_new_tokens,
+        )
+
+        generated_ids = output[0]  # The generated sequence including the prompt
+        generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
+        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+        return generated_text
 
 
     def evaluate(self) -> dict[str, Any]:
@@ -31,23 +71,29 @@ class MathTask(BaseTask):
         print("Type of answer:", type(answer))
 
         # Construct the prompt/message
-        instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu içinde verin.
+        instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.
 
 Nihai Cevap için Uyulması Gereken Format Kuralları:
 
-1.
-2. Kesirleri her zaman en sade halde verilmeli.
+1. Kesirler her zaman en sade hallerinde verilmeli.
 - Matris içi kesirler: x/y biçiminde.
 - Diğer tüm kesirler: \\frac{{x}}{{y}} biçiminde.
-
-
-
-
-
+2. Çarpma işareti (*) kullanılmamalı. Örnek: 2x yazın, 2*x değil.
+3. Birden çok değişken varsa alfabetik sıraya uyulmalı ve (x, y, z...), polinomları azalan derece sırasına göre yazılmalı.
+4. Her zaman aynı gösterim biçimi kullanılmalı. Ondalık yerine kesir kullanılmalı (ör. 0.5 yerine \\frac{{1}}{{2}} ).
+5. Faktörize polinomlar daima aynı faktör sırası ile verilsin; her sorguda aynı cevabı verecek şekilde tutarlılığı koruyun.
+6. Nihai cevabı kutu dışında tekrar etmeyin, biçimi değiştirmeyin. Aynı soru tekrarlandığında aynı formatı ve cevabı verin.
+7. Nihai cevap, tek seferde \\boxed{{...}} içinde verilmeli. Örnek: Cevap x ise, "\\boxed{{x}}".
+
 
 Görev: Problemi çözün, son adımda yukarıdaki kurallara tam uyan tek bir kutu içinde nihai cevabı verin.
 
-
+
+Çözüm:
+
+
+Nihai cevap:
+"""
         prompt = f"{instruction}\n\nSoru:\n{row["question"]}\n"
         message = prompt
 
src/deepeval/metaphors_and_idioms.py
ADDED
@@ -0,0 +1,87 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import os
import ast
import re
from datasets import load_dataset, get_dataset_split_names
HF_TOKEN = os.getenv("HF_TOKEN")

class MetaphorsAndIdiomsTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/metaphors_and_idioms", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset  # dataset.select(range(min(10, len(dataset))))

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: defaultdict(lambda: {'correct': 0, 'total': 0}))

        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            category = "hard" if row["level"] == 1 else "easy" if row["level"] == 0 else None
            answer_index = row["answer"]
            correct_answer_letter = chr(65 + answer_index)
            context = row["context"]
            choices = ast.literal_eval(row["choices"])  # Convert string to list
            formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
            subset = row["idiom_type"]

            if subset == "atasözü":
                question = "Aşağıda verilen durum hangi atasözü ile en iyi ifade edilebilir?"
            elif subset == "deyim":
                question = """Verilen bağlamda "[MASKED]" ile boş bırakılan yere hangi deyim getirilirse cümlenin akışı anlamlı olur?"""
            else:
                question = "Aşağıda verilen durum hangi atasözü ile en iyi ifade edilebilir?"

            # Prints for debugging
            print(f"Difficulty: {category}")
            print("Type of difficulty:", type(category))
            print(f"Answer: {correct_answer_letter}")
            print("Type of answer:", type(answer_index))

            # Construct the prompt/message
            instruction = ""
            prompt = f"Soru: {question}\nBağlam: {context}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')

            # Print answers
            print(f"Correct Answer: {correct_answer_letter}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")
            print(f"Result: {correct_answer_letter == model_answer_cleaned}")

            # Check if correct based on metric
            if correct_answer_letter == model_answer_cleaned:
                true += 1
                difficulty_results[subset][category]['correct'] += 1

            difficulty_results[subset][category]['total'] += 1

        # Print results categorized by subset and difficulty
        for subset in difficulty_results.keys():
            subset_results = difficulty_results[subset]
            for category, stats in subset_results.items():
                calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
                print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/mmlu.py
ADDED
@@ -0,0 +1,87 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import os
import ast
import re
from datasets import load_dataset, get_dataset_config_names
HF_TOKEN = os.getenv("HF_TOKEN")

class MMLUTask(BaseTask):
    def __init__(self, model_name):
        self.subsets = get_dataset_config_names("metunlp/mmlu_tr")
        print(self.subsets)
        super().__init__("metunlp/mmlu_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        evaluate_count = 1
        print("Loading dataset from Hugging Face.")
        dataset_dict = {}
        for subset in self.subsets:
            subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
            dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
        print("Dataset loaded.")
        return dataset_dict

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})

        total_count = 0
        true = 0

        for subset in self.subsets:
            curr_dataset = self.dataset[subset]
            print(curr_dataset[0])

            for row in curr_dataset:
                total_count += 1

                # Get values from row
                question = row["question"]
                answer_index = row["answer"]
                correct_answer_letter = chr(65 + answer_index)
                choices = ast.literal_eval(row["choices"])  # Convert string to list
                formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])

                # Prints for debugging
                print(f"Answer: {correct_answer_letter}")
                print("Type of answer:", type(answer_index))

                # Construct the prompt/message
                instruction = f"Aşağıda {row["subject"]} konusunda çoktan seçmeli bir soru verilmiştir."
                prompt = f"{instruction}\n\nSoru: {question}\nSeçenekler:\n{formatted_choices}\n\n"
                message = prompt

                # Get/format answer of the model
                model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
                responses.append(model_answer)
                model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')

                # Print answers
                print(f"Correct Answer: {correct_answer_letter}")
                print(f"Model Answer: {model_answer}")
                print(f"Model Answer Cleaned: {model_answer_cleaned}")
                print(f"Result: {correct_answer_letter == model_answer_cleaned}")

                # Check if correct based on metric
                if correct_answer_letter == model_answer_cleaned:
                    true += 1
                    difficulty_results[subset]['correct'] += 1

                difficulty_results[subset]['total'] += 1

        # Print results categorized by subset
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/sts.py
ADDED
@@ -0,0 +1,131 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import re
from datasets import load_dataset
import os
from dotenv import load_dotenv
import openai
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList, LogitsProcessor
import torch
from typing import List

class STSTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/sts_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(1, len(dataset))))

    def generate_response_sts_multi_token(self, msg, max_new_tokens=5, choices: list = []):
        """
        Handles multiple-choice questions where answers might have multiple tokens.
        """
        # Ensure tokenizer has proper special tokens set
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        chat = [
            {"role": "user",
             "content": "You are a sentence similarity scoring chatbot. Only respond with one of the given scores: 0, 1, 2, 3, 4, or 5."},
            {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
            {"role": "user", "content": f"{msg}"},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        print(formatted_chat)
        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs.input_ids.to(self.model.device)
        attention_mask = inputs.attention_mask.to(self.model.device)

        # Encode the allowed score tokens
        letters = ["0", "1", "2", "3", "4", "5"]
        encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
        flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist]  # Flatten the list
        print(flattened_encoded_choices)

        allowed_tokens = flattened_encoded_choices
        allowed_tokens += self.get_chat_template_tokens()  # Get the special chat tokens
        allowed_token_ids = set(allowed_tokens)  # Ensure uniqueness

        # Custom LogitsProcessor to restrict generation
        class RestrictToABCDLogitsProcessor(LogitsProcessor):
            def __call__(self, input_ids, scores):
                mask = torch.full_like(scores, float("-inf"))  # Block all tokens
                mask[:, list(allowed_token_ids)] = scores[:, list(allowed_token_ids)]  # Allow only the score tokens (0-5)
                return mask

        logits_processor = LogitsProcessorList([RestrictToABCDLogitsProcessor()])

        # Generate response
        output = self.model.generate(
            input_ids,
            do_sample=True,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            temperature=0.4,
            logits_processor=logits_processor,
        )
        generated_ids = output[0]  # The generated sequence including the prompt
        generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
        return generated_text

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = {'correct': 0, 'total': 0}

        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            answer = row["score"]
            choices = ["0", "1", "2", "3", "4", "5"]

            # Prints for debugging
            print(f"Answer: {answer}")
            print("Type of answer:", type(answer))

            # Construct the prompt/message
            instruction = f"Aşağıda verilen iki cümlenin birbirlerine olan anlamsal benzerliğini 0'dan 5'e kadar olan bir tam sayıyla söyleyin."
            prompt = f"""{instruction}\nCümle 1: {row["sentence_1"]}\nCümle 2: {row["sentence_2"]}\nSadece tek bir tam sayı söyleyin, ek bir kelime ya da sembol kullanmayın."""
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_sts_multi_token(message, max_new_tokens=2)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')

            # Print answers
            print(f"Correct Answer: {answer}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")
            print(f"Result: {answer == model_answer_cleaned}")

            # Check if correct based on metric
            if answer == model_answer_cleaned:
                true += 1
                difficulty_results['correct'] += 1

            difficulty_results['total'] += 1

        # Print results
        stats = difficulty_results
        calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
        print(f"Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/topic_detection.py
ADDED
@@ -0,0 +1,79 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import ast


class TopicDetectionTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/topic_detection_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(10, len(dataset))))

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            choices = ast.literal_eval(row["choices"])  # Convert string to list
            formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
            category = row["level"].lower().replace(' ', '')
            answer = row["answer"]
            text = row["text"]

            # Prints for debugging
            print(f"Choices: {choices}")
            print("Type of choices:", type(choices))
            print("Type of answer:", type(answer))

            # Get answer index (starting from 0)
            if type(answer) == int:
                answer_index = answer
            else:
                answer_index = int(answer)
            correct_answer_letter = chr(65 + answer_index)

            # Construct the prompt/message
            instruction = "Aşağıdaki metni analiz et ve seçeneklerden bu metnin en olası kategorisini belirle. Temaya ve detaylara dikkat ederek metnin ana fikrini göz önünde bulundurarak soruyu cevapla."
            prompt = f"{instruction}\n\nMetin:\n{text}\nSeçenekler:\n{formatted_choices}\n\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')

            # Print answers
            print(f"Correct Answer: {correct_answer_letter}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")
            print(f"Result: {correct_answer_letter == model_answer_cleaned}")

            # Check if correct based on metric
            if correct_answer_letter == model_answer_cleaned:
                true += 1
                difficulty_results[category]['correct'] += 1

            difficulty_results[category]['total'] += 1

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/turkish_vocabulary.py
ADDED
@@ -0,0 +1,100 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import os
import ast
import re
from datasets import load_dataset, get_dataset_split_names
HF_TOKEN = os.getenv("HF_TOKEN")

class TurkishVocabularyTask(BaseTask):
    def __init__(self, model_name):
        self.subsets = ["rare", "loan"]
        super().__init__("metunlp/turkish_vocabulary", model_name=model_name)

    def load_dataset_from_hf(self):
        evaluate_count = 1
        print("Loading dataset from Hugging Face.")
        dataset_dict = {}
        for subset in self.subsets:
            subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
            dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
        print("Dataset loaded.")
        return dataset_dict

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: defaultdict(lambda: {'correct': 0, 'total': 0}))

        total_count = 0
        true = 0

        for subset in self.subsets:
            curr_dataset = self.dataset[subset]
            print(curr_dataset[0])

            # Determine the question based on the subset
            if subset == "rare":
                question = "Verilen kelimenin eş anlamlısı aşağıdakilerden hangisidir?"
            elif subset == "loan":
                question = "Verilen kelimenin Türkçe kökenli eş anlamlısı aşağıdakilerden hangisidir?"
            else:
                question = "Verilen kelimenin eş anlamlısı aşağıdakilerden hangisidir?"

            for row in curr_dataset:
                total_count += 1

                # Get values from row
                category = "hard" if row["level"] == 1 else "easy" if row["level"] == 0 else None
                answer_index = row["answer"]
                correct_answer_letter = chr(65 + answer_index)
                word = row["word"]
                choices = ast.literal_eval(row["choices"])  # Convert string to list
                formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])

                # Prints for debugging
                print(f"Difficulty: {category}")
                print("Type of difficulty:", type(category))
                print(f"Answer: {correct_answer_letter}")
                print("Type of answer:", type(answer_index))

                # Construct the prompt/message
                instruction = ""
                prompt = f"Soru: {question}\nKelime: {word}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
                message = prompt

                # Get/format answer of the model
                model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
                responses.append(model_answer)
                model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')

                # Print answers
                print(f"Correct Answer: {correct_answer_letter}")
                print(f"Model Answer: {model_answer}")
                print(f"Model Answer Cleaned: {model_answer_cleaned}")
                print(f"Result: {correct_answer_letter == model_answer_cleaned}")

                # Check if correct based on metric
                if correct_answer_letter == model_answer_cleaned:
                    true += 1
                    difficulty_results[subset][category]['correct'] += 1

                difficulty_results[subset][category]['total'] += 1

        # Print results categorized by difficulty
        for subset in self.subsets:
            subset_results = difficulty_results[subset]
            for category, stats in subset_results.items():
                calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
                print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
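All of the new tasks report the same two metrics through src.deepeval.utils, whose implementation is not part of this commit. A plausible stand-in, assuming the plain proportion and its binomial standard error (the repo's actual formulas may differ):

    import math

    # Hypothetical sketches of accuracy / accuracy_standard_error from src.deepeval.utils.
    def accuracy(correct: int, total: int) -> float:
        return correct / total if total > 0 else 0.0

    def accuracy_standard_error(acc: float, total: int) -> float:
        return math.sqrt(acc * (1.0 - acc) / total) if total > 0 else 0.0

    print(accuracy(3, 4))                    # 0.75
    print(accuracy_standard_error(0.75, 4))  # ~0.2165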