Upload 9 files
- src/deepeval/bias.py +98 -0
- src/deepeval/math.py +128 -0
- src/deepeval/metaphors_and_idioms.py +87 -0
- src/deepeval/mmlu.py +87 -0
- src/deepeval/ner.py +166 -0
- src/deepeval/pos.py +159 -0
- src/deepeval/sts.py +131 -0
- src/deepeval/topic_detection.py +79 -0
- src/deepeval/turkish_vocabulary.py +100 -0
src/deepeval/bias.py
ADDED
@@ -0,0 +1,98 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import ast


class BiasTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/sosyoloji_bias", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(1, len(dataset))))

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 2

            # Get values from row

            ## common
            choices = ast.literal_eval(row["choices"])  # Convert string to list
            formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])

            ## ambiguous context
            ambiguous_context = row["ambiguous_context"]
            ambiguous_question = row["question_ambiguous"]
            ambiguous_answer = row["answer_ambiguous"]
            ambiguous_correct_answer_letter = chr(64 + ambiguous_answer)  # 65 - 1 since we need the indexing to start from 0
            ambiguous_prompt = f"Bağlam: {ambiguous_context}\nSoru: {ambiguous_question}\nSeçenekler:{formatted_choices}"

            ## disambiguated context
            disambiguated_context = row["disambiguated_context"]
            disambiguated_question = row["question_disambiguated"]
            disambiguated_answer = row["answer_disambiguated"]
            disambiguated_correct_answer_letter = chr(64 + disambiguated_answer)  # 65 - 1 since we need the indexing to start from 0
            disambiguated_prompt = f"Bağlam: {disambiguated_context}\nSoru: {disambiguated_question}\nSeçenekler:{formatted_choices}"

            # Prints for debugging
            print(f"Choices: {choices}")
            print("Type of choices:", type(choices))

            # STAGE 1
            instruction = ""
            prompt = f"Stage1:\n{ambiguous_prompt}\n{instruction}\n"
            message = prompt
            ambiguous_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
            ambiguous_model_answer_cleaned = ambiguous_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
            ## Print answers
            print(f"Correct Answer: {ambiguous_correct_answer_letter}")
            print(f"Model Answer: {ambiguous_model_answer}")
            print(f"Model Answer Cleaned: {ambiguous_model_answer_cleaned}")
            print(f"Result: {ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned}")
            ## Check if correct based on metric
            if ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned:
                true += 1
                difficulty_results["ambiguous"]['correct'] += 1

            difficulty_results["ambiguous"]['total'] += 1

            # STAGE 2
            instruction = ""
            prompt = f"Stage2:\n{disambiguated_prompt}\n{instruction}\n"
            message = prompt
            disambiguated_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
            disambiguated_model_answer_cleaned = disambiguated_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
            ## Print answers
            print(f"Correct Answer: {disambiguated_correct_answer_letter}")
            print(f"Model Answer: {disambiguated_model_answer}")
            print(f"Model Answer Cleaned: {disambiguated_model_answer_cleaned}")
            print(f"Result: {disambiguated_correct_answer_letter == disambiguated_model_answer_cleaned}")
            responses.append((ambiguous_model_answer_cleaned, disambiguated_model_answer_cleaned))

            ## Check if correct based on metric
            if disambiguated_correct_answer_letter == disambiguated_model_answer_cleaned:
                true += 1
                difficulty_results["disambiguated"]['correct'] += 1

            difficulty_results["disambiguated"]['total'] += 1

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
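A side note on the answer-letter mapping: `BiasTask` converts its answer fields with `chr(64 + answer)`, while the other multiple-choice tasks in this upload use `chr(65 + answer_index)`. A minimal sketch of the two conventions follows; the assumption that the bias answer fields are 1-indexed (and the others 0-indexed) is inferred from the `chr(64 + ...)` call, not stated in the upload.

# Minimal sketch of the two letter-index conventions used in this upload.
# Assumption: answer_ambiguous / answer_disambiguated are 1-indexed, the
# other tasks' answer fields are 0-indexed.
def letter_from_one_indexed(answer: int) -> str:
    # 1 -> "A", 2 -> "B", ...
    return chr(64 + answer)

def letter_from_zero_indexed(answer_index: int) -> str:
    # 0 -> "A", 1 -> "B", ...
    return chr(65 + answer_index)

assert letter_from_one_indexed(1) == letter_from_zero_indexed(0) == "A"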
src/deepeval/math.py
ADDED
@@ -0,0 +1,128 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import re

class MathTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/math_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(1, len(dataset))))

    def generate_response_oeqa_multi_token(self, msg, max_new_tokens: int = 128):
        """
        Handles open-ended questions whose answers might span multiple tokens.
        """
        # Ensure tokenizer has proper special tokens set
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        chat = [
            {"role": "user", "content": "You are a question-answering chatbot."},
            {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
            {"role": "user", "content": f"{msg}"},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        print(formatted_chat)

        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs.input_ids.to(self.model.device)
        attention_mask = inputs.attention_mask.to(self.model.device)

        # Generate response with proper token limits
        output = self.model.generate(
            input_ids,
            do_sample=True,
            attention_mask=attention_mask,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            temperature=0.4,
            max_new_tokens=max_new_tokens,
        )

        generated_ids = output[0]  # The generated sequence including the prompt
        generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

        return generated_text

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            category = str(row["difficulty"])
            answer = row["final_answer"]

            # Prints for debugging
            print(f"Answer: {answer}")
            print("Type of answer:", type(answer))

            # Construct the prompt/message
            instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.

Nihai Cevap için Uyulması Gereken Format Kuralları:

1. Kesirler her zaman en sade hallerinde verilmeli.
    - Matris içi kesirler: x/y biçiminde.
    - Diğer tüm kesirler: \\frac{{x}}{{y}} biçiminde.
2. Çarpma işareti (*) kullanılmamalı. Örnek: 2x yazın, 2**x* değil.
3. Birden çok değişken varsa alfabetik sıraya uyulmalı ve (x, y, z...), polinomları azalan derece sırasına göre yazılmalı.
4. Her zaman aynı gösterim biçimi kullanılmalı. Ondalık yerine kesir kullanılmalı (ör. 0.5 yerine \\frac{{1}}{{2}} ).
5. Faktörize polinomlar daima aynı faktör sırası ile verilsin; her sorguda aynı cevabı verecek şekilde tutarlılığı koruyun.
6. Nihai cevabı kutu dışında tekrar etmeyin, biçimi değiştirmeyin. Aynı soru tekrarlandığında aynı formatı ve cevabı verin.
7. Nihai cevap, tek seferde \\boxed{{...}} içinde verilmeli. Örnek: Cevap x ise, "\\boxed{{x}}".


Görev: Problemi çözün, son adımda yukarıdaki kurallara tam uyan tek bir kutu içinde nihai cevabı verin.


Çözüm:


Nihai cevap:
"""
            prompt = f"{instruction}\n\nSoru:\n{row['question']}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_oeqa_multi_token(message)
            responses.append(model_answer)
            boxed_match = re.search(r"\\boxed{([^}]*)}", model_answer)
            model_answer_cleaned = boxed_match.group(1) if boxed_match else None  # Extract the boxed content, None if no box was produced

            # Print answers
            print(f"Correct Answer: {answer}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")
            print(f"Result: {answer == model_answer_cleaned}")

            # Check if correct based on metric
            if answer == model_answer_cleaned:
                true += 1
                difficulty_results[category]['correct'] += 1

            difficulty_results[category]['total'] += 1

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
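For reference, a small standalone sketch of the `\boxed{...}` extraction used by `MathTask.evaluate` above; the sample outputs are invented for illustration only.

import re

def extract_boxed(model_output: str):
    """Return the content of the first \\boxed{...} in the output, or None."""
    # Note: [^}]* stops at the first '}', so nested braces (e.g. \frac{1}{2})
    # are captured only up to the first closing brace.
    match = re.search(r"\\boxed{([^}]*)}", model_output)
    return match.group(1) if match else None

# Hypothetical model outputs, for illustration only
assert extract_boxed(r"Çözüm adımları ... Nihai cevap: \boxed{42}") == "42"
assert extract_boxed("kutu içermeyen bir cevap") is None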
src/deepeval/metaphors_and_idioms.py
ADDED
@@ -0,0 +1,87 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import os
import ast
import re
from datasets import load_dataset, get_dataset_split_names
HF_TOKEN = os.getenv("HF_TOKEN")

class MetaphorsAndIdiomsTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/metaphors_and_idioms", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset  # dataset.select(range(min(10, len(dataset))))

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: defaultdict(lambda: {'correct': 0, 'total': 0}))

        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            category = "hard" if row["level"] == 1 else "easy" if row["level"] == 0 else None
            answer_index = row["answer"]
            correct_answer_letter = chr(65 + answer_index)
            context = row["context"]
            choices = ast.literal_eval(row["choices"])  # Convert string to list
            formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
            subset = row["idiom_type"]

            if subset == "atasözü":
                question = "Aşağıda verilen durum hangi atasözü ile en iyi ifade edilebilir?"
            elif subset == "deyim":
                question = """Verilen bağlamda "[MASKED]" ile boş bırakılan yere hangi deyim getirilirse cümlenin akışı anlamlı olur?"""
            else:
                question = "Aşağıda verilen durum hangi atasözü ile en iyi ifade edilebilir?"

            # Prints for debugging
            print(f"Difficulty: {category}")
            print("Type of difficulty:", type(category))
            print(f"Answer: {correct_answer_letter}")
            print("Type of answer:", type(answer_index))

            # Construct the prompt/message
            instruction = ""
            prompt = f"Soru: {question}\nBağlam: {context}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')

            # Print answers
            print(f"Correct Answer: {correct_answer_letter}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")
            print(f"Result: {correct_answer_letter == model_answer_cleaned}")

            # Check if correct based on metric
            if correct_answer_letter == model_answer_cleaned:
                true += 1
                difficulty_results[subset][category]['correct'] += 1

            difficulty_results[subset][category]['total'] += 1

        # Print results categorized by difficulty
        for subset in difficulty_results.keys():
            subset_results = difficulty_results[subset]
            for category, stats in subset_results.items():
                calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
                print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/mmlu.py
ADDED
@@ -0,0 +1,87 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import os
import ast
import re
from datasets import load_dataset, get_dataset_config_names
HF_TOKEN = os.getenv("HF_TOKEN")

class MMLUTask(BaseTask):
    def __init__(self, model_name):
        self.subsets = get_dataset_config_names("metunlp/mmlu_tr")
        print(self.subsets)
        super().__init__("metunlp/mmlu_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        evaluate_count = 1
        print("Loading dataset from Hugging Face.")
        dataset_dict = {}
        for subset in self.subsets:
            subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
            dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
        print("Dataset loaded.")
        return dataset_dict

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})

        total_count = 0
        true = 0

        for subset in self.subsets:
            curr_dataset = self.dataset[subset]
            print(curr_dataset[0])

            for row in curr_dataset:
                total_count += 1

                # Get values from row
                question = row["question"]
                answer_index = row["answer"]
                correct_answer_letter = chr(65 + answer_index)
                choices = ast.literal_eval(row["choices"])  # Convert string to list
                formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])

                # Prints for debugging
                print(f"Answer: {correct_answer_letter}")
                print("Type of answer:", type(answer_index))

                # Construct the prompt/message
                instruction = f"Aşağıda {row['subject']} konusunda çoktan seçmeli bir soru verilmiştir."
                prompt = f"{instruction}\n\nSoru: {question}\nSeçenekler:\n{formatted_choices}\n\n"
                message = prompt

                # Get/format answer of the model
                model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
                responses.append(model_answer)
                model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')

                # Print answers
                print(f"Correct Answer: {correct_answer_letter}")
                print(f"Model Answer: {model_answer}")
                print(f"Model Answer Cleaned: {model_answer_cleaned}")
                print(f"Result: {correct_answer_letter == model_answer_cleaned}")

                # Check if correct based on metric
                if correct_answer_letter == model_answer_cleaned:
                    true += 1
                    difficulty_results[subset]['correct'] += 1

                difficulty_results[subset]['total'] += 1

        # Print results categorized by subset
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/ner.py
ADDED
@@ -0,0 +1,166 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import re

class NERTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/tr_ner", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(1, len(dataset))))

    def generate_response_oeqa_multi_token(self, msg, max_new_tokens: int = 128):
        """
        Handles open-ended questions whose answers might span multiple tokens.
        """
        # Ensure tokenizer has proper special tokens set
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        chat = [
            {"role": "user", "content": "You are a question-answering chatbot."},
            {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
            {"role": "user", "content": f"{msg}"},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        print(formatted_chat)

        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs.input_ids.to(self.model.device)
        attention_mask = inputs.attention_mask.to(self.model.device)

        # Generate response with proper token limits
        output = self.model.generate(
            input_ids,
            do_sample=True,
            attention_mask=attention_mask,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            temperature=0.4,
            max_new_tokens=max_new_tokens,
        )

        generated_ids = output[0]  # The generated sequence including the prompt
        generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

        return generated_text

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            category = str(row["difficulty"])
            answer = row["final_answer"]

            # Prints for debugging
            print(f"Answer: {answer}")
            print("Type of answer:", type(answer))

            # Construct the prompt/message
            instruction = ("Aşağıdaki Named Entity Recognition (NER) için etiketlenmesi gereken cümleler vardır. "
                           "Cümlelerdeki varlıkları belirleyin ve şu kategorilere ayırın: CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PER, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, TITLE, WORK_OF_ART. "
                           ""
                           "Varlıklar, anlamlı bilgiler içeren terimlerdir ve aşağıdaki şekilde tanımlanır: "
                           "CARDINAL: Nicelik veya sıralama belirtmeyen sayısal ifadeler."
                           "DATE: Belirli bir tarih veya zaman ifadeleri."
                           "EVENT: Adlandırılmış olaylar veya durumlar."
                           "FAC: Binalar veya önemli yerler gibi tesisler."
                           "GPE: Ülke, şehir veya eyalet gibi coğrafi-politik varlıklar."
                           "LANGUAGE: Adlandırılmış diller."
                           "LAW: Yasal belgeler, düzenlemeler veya kanunlar."
                           "LOC: Coğrafi veya fiziksel konumlar (GPE dışındaki)."
                           "MONEY: Parasal değerler."
                           "NORP: Milletler, dini veya siyasi gruplar."
                           "ORDINAL: Sıralama veya dereceler."
                           "ORG: Organizasyonlar veya kurumlar."
                           "PER: Kişisel unvanlar veya sıfatlar."
                           "PERSON: Bireylerin isimleri."
                           "PRODUCT: Üretilen nesneler veya araçlar."
                           "QUANTITY: Ölçülebilir miktarlar ve birimler."
                           "TIME: Günün belirli saatleri."
                           "TITLE: Kişi unvanları."
                           "WORK_OF_ART: Sanat eserleri, kitaplar, müzik vb. Adlar, tarih ifadeleri, konumlar gibi belirgin bilgiler varlıktır."
                           ""
                           "Fiiller, sıfatlar, zarflar, soyut kavramlar gibi ifadeler varlık değildir. Çıktıyı aşağıdaki JSON formatında döndürün. "
                           ""
                           "Örnekler: "
                           "Girdi: "
                           "sentence: \"Üç yıl aradan sonra gerçekleştirilen ve Karadeniz, Ege ve Akdeniz’de düzenlenecek olan tatbikata ilişkin Yunanistan'ın Kathimerini gazetesi 'Türk-Yunan: Çetin donanma dengesinin gücü' başlığını kullandı.\""
                           "Çıktı: "
                           "Üç yıl,DATE"
                           "Karadeniz,LOC"
                           "Ege,LOC"
                           "Akdeniz,LOC"
                           "Yunanistan,GPE"
                           "Kathimerini,ORG"
                           "Türk,NORP"
                           ""
                           "Girdi:"
                           "sentence: \"Evlendikten sonra oyunculuğu bırakan Makal, geçen yıl eşi ve oğluyla beraber İstanbul’dan Göcek’e taşınmıştı."
                           "Çıktı: "
                           "Makal,PERSON"
                           "İstanbul,GPE"
                           "Göcek,GPE"
                           ""
                           "Girdi:"
                           "sentence: \"Yeşil-kırmızılılardan 2016’da ayrılıp 3 sezonluk aradan sonra 2019’da geri dönen Sarıca, takımına 2021 yılında Şampiyonlar Ligi’nde, 2023’te de Süper Lig’de iki final oynattı."
                           "Çıktı:"
                           "2016’da,DATE"
                           "3,CARDINAL"
                           "2019’da,DATE"
                           "Sarıca,PERSON"
                           "2021,DATE"
                           "Şampiyonlar Ligi’nde,EVENT"
                           "2023’te,DATE"
                           "Süper Lig’de,EVENT"
                           "iki,CARDINAL"
                           ""
                           "Verilen cümlelerdeki her varlığı csv formatında yukarıdaki örneklere benzer şekilde belirleyin. Çıktıdaki her satırı aşağıdaki gibi oluşturun: "
                           "<Varlık metni>,<Varlık etiketi>")
            prompt = f"{instruction}\n\nSoru:\n{row['question']}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_oeqa_multi_token(message)
            responses.append(model_answer)
            model_answer_cleaned = model_answer

            # Print answers
            print(f"Correct Answer: {answer}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")
            print(f"Result: {answer == model_answer_cleaned}")

            # Check if correct based on metric
            if answer == model_answer_cleaned:
                true += 1
                difficulty_results[category]['correct'] += 1

            difficulty_results[category]['total'] += 1

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
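NERTask above scores by exact string equality between `final_answer` and the raw generation. As a hedged illustration only (this helper is hypothetical and not part of the uploaded code), the `<entity>,<label>` lines could be parsed into a set of pairs for an order-insensitive comparison:

def parse_entity_lines(text: str) -> set[tuple[str, str]]:
    """Parse '<entity text>,<label>' lines into a set of (entity, label) pairs."""
    pairs = set()
    for line in text.strip().splitlines():
        if "," not in line:
            continue
        # Split on the last comma so entity texts containing commas stay intact
        entity, _, label = line.rpartition(",")
        pairs.add((entity.strip(), label.strip()))
    return pairs

# Example using the prompt's own illustration
gold = "Makal,PERSON\nİstanbul,GPE\nGöcek,GPE"
pred = "İstanbul,GPE\nMakal,PERSON\nGöcek,GPE"
assert parse_entity_lines(gold) == parse_entity_lines(pred)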
src/deepeval/pos.py
ADDED
@@ -0,0 +1,159 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import re

class POSTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/tr_pos", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(1, len(dataset))))

    def generate_response_oeqa_multi_token(self, msg, max_new_tokens: int = 128):
        """
        Handles open-ended questions whose answers might span multiple tokens.
        """
        # Ensure tokenizer has proper special tokens set
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        chat = [
            {"role": "user", "content": "You are a question-answering chatbot."},
            {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
            {"role": "user", "content": f"{msg}"},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        print(formatted_chat)

        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs.input_ids.to(self.model.device)
        attention_mask = inputs.attention_mask.to(self.model.device)
        # Unused NER-style prompt kept from the original upload
        prompt = ("Aşağıdaki Named Entity Recognition (NER) için etiketlenmesi gereken cümleler vardır. "
                  "Cümlelerdeki varlıkları belirleyin ve şu kategorilere ayırın: CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PER, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, TITLE, WORK_OF_ART. "
                  ""
                  "Varlıklar, anlamlı bilgiler içeren terimlerdir ve aşağıdaki şekilde tanımlanır: "
                  "CARDINAL: Nicelik veya sıralama belirtmeyen sayısal ifadeler."
                  "DATE: Belirli bir tarih veya zaman ifadeleri."
                  "EVENT: Adlandırılmış olaylar veya durumlar."
                  "FAC: Binalar veya önemli yerler gibi tesisler."
                  "GPE: Ülke, şehir veya eyalet gibi coğrafi-politik varlıklar."
                  "LANGUAGE: Adlandırılmış diller."
                  "LAW: Yasal belgeler, düzenlemeler veya kanunlar."
                  "LOC: Coğrafi veya fiziksel konumlar (GPE dışındaki)."
                  "MONEY: Parasal değerler."
                  "NORP: Milletler, dini veya siyasi gruplar."
                  "ORDINAL: Sıralama veya dereceler."
                  "ORG: Organizasyonlar veya kurumlar."
                  "PER: Kişisel unvanlar veya sıfatlar."
                  "PERSON: Bireylerin isimleri."
                  "PRODUCT: Üretilen nesneler veya araçlar."
                  "QUANTITY: Ölçülebilir miktarlar ve birimler."
                  "TIME: Günün belirli saatleri."
                  "TITLE: Kişi unvanları."
                  "WORK_OF_ART: Sanat eserleri, kitaplar, müzik vb. Adlar, tarih ifadeleri, konumlar gibi belirgin bilgiler varlıktır."
                  ""
                  "Fiiller, sıfatlar, zarflar, soyut kavramlar gibi ifadeler varlık değildir. Çıktıyı aşağıdaki JSON formatında döndürün. "
                  ""
                  "Örnekler: "
                  "Girdi: "
                  "\"sentence\": \"Üç yıl aradan sonra gerçekleştirilen ve Karadeniz, Ege ve Akdeniz’de düzenlenecek olan tatbikata ilişkin Yunanistan'ın Kathimerini gazetesi 'Türk-Yunan: Çetin donanma dengesinin gücü' başlığını kullandı.\""
                  "Çıktı: "
                  "Üç yıl: DATE\" }, { \"text\": \"Karadeniz\", \"label\": \"LOC\" }, { \"text\": \"Ege\", \"label\": \"LOC\" }, { \"text\": \"Akdeniz\", \"label\": \"LOC\" }, { \"text\": \"Yunanistan\", \"label\": \"GPE\" }, { \"text\": \"Kathimerini\", \"label\": \"ORG\" }, { \"text\": \"Türk\", \"label\": \"NORP\" }]} Girdi: {\"sentence\": \"Evlendikten sonra oyunculuğu bırakan Makal, geçen yıl eşi ve oğluyla beraber İstanbul’dan Göcek’e taşınmıştı.\"} Çıktı: {\"entities\": [{ \"text\": \"Makal\", \"label\": \"PERSON\" }, { \"text\": \"İstanbul\", \"label\": \"GPE\" }, { \"text\": \"Göcek\", \"label\": \"GPE\" }]} Girdi: {\"sentence\": \"Yeşil-kırmızılılardan 2016’da ayrılıp 3 sezonluk aradan sonra 2019’da geri dönen Sarıca, takımına 2021 yılında Şampiyonlar Ligi’nde, 2023’te de Süper Lig’de iki final oynattı.\"} Çıktı: {\"entities\": [{ \"text\": \"2016’da\", \"label\": \"DATE\" }, { \"text\": \"3\", \"label\": \"CARDINAL\" }, { \"text\": \"2019’da\", \"label\": \"DATE\" }, { \"text\": \"Sarıca\", \"label\": \"PERSON\" }, { \"text\": \"2021\", \"label\": \"DATE\" }, { \"text\": \"Şampiyonlar Ligi’nde\", \"label\": \"EVENT\" }, { \"text\": \"2023’te\", \"label\": \"DATE\" }, { \"text\": \"Süper Lig’de\", \"label\": \"EVENT\" }, { \"text\": \"iki\", \"label\": \"CARDINAL\" }]}. Verilen cümlelerdeki varlıkları JSON formatında yukarıdaki örneklere benzer şekilde belirleyin. Çıktıyı aşağıdaki gibi oluşturun: Girdi Formatı: {\"sentence\": \"<CÜMLE>\"} Çıktı Formatı: {\"entities\": [{ \"text\": \"<Varlık metni>\", \"label\": \"<Varlık etiketi>\" }]}")

        # Generate response with proper token limits
        output = self.model.generate(
            input_ids,
            do_sample=True,
            attention_mask=attention_mask,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            temperature=0.4,
            max_new_tokens=max_new_tokens,
        )

        generated_ids = output[0]  # The generated sequence including the prompt
        generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

        return generated_text

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            category = str(row["difficulty"])
            answer = row["final_answer"]

            # Prints for debugging
            print(f"Answer: {answer}")
            print("Type of answer:", type(answer))

            # Construct the prompt/message
            instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.

Nihai Cevap için Uyulması Gereken Format Kuralları:

1. Kesirler her zaman en sade hallerinde verilmeli.
    - Matris içi kesirler: x/y biçiminde.
    - Diğer tüm kesirler: \\frac{{x}}{{y}} biçiminde.
2. Çarpma işareti (*) kullanılmamalı. Örnek: 2x yazın, 2**x* değil.
3. Birden çok değişken varsa alfabetik sıraya uyulmalı ve (x, y, z...), polinomları azalan derece sırasına göre yazılmalı.
4. Her zaman aynı gösterim biçimi kullanılmalı. Ondalık yerine kesir kullanılmalı (ör. 0.5 yerine \\frac{{1}}{{2}} ).
5. Faktörize polinomlar daima aynı faktör sırası ile verilsin; her sorguda aynı cevabı verecek şekilde tutarlılığı koruyun.
6. Nihai cevabı kutu dışında tekrar etmeyin, biçimi değiştirmeyin. Aynı soru tekrarlandığında aynı formatı ve cevabı verin.
7. Nihai cevap, tek seferde \\boxed{{...}} içinde verilmeli. Örnek: Cevap x ise, "\\boxed{{x}}".


Görev: Problemi çözün, son adımda yukarıdaki kurallara tam uyan tek bir kutu içinde nihai cevabı verin.


Çözüm:


Nihai cevap:
"""
            prompt = f"{instruction}\n\nSoru:\n{row['question']}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_oeqa_multi_token(message)
            responses.append(model_answer)
            boxed_match = re.search(r"\\boxed{([^}]*)}", model_answer)
            model_answer_cleaned = boxed_match.group(1) if boxed_match else None  # Extract the boxed content, None if no box was produced

            # Print answers
            print(f"Correct Answer: {answer}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")
            print(f"Result: {answer == model_answer_cleaned}")

            # Check if correct based on metric
            if answer == model_answer_cleaned:
                true += 1
                difficulty_results[category]['correct'] += 1

            difficulty_results[category]['total'] += 1

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/sts.py
ADDED
@@ -0,0 +1,131 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import re
from datasets import load_dataset
import os
from dotenv import load_dotenv
import openai
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList, LogitsProcessor
import torch
from typing import List

class STSTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/sts_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(1, len(dataset))))

    def generate_response_sts_multi_token(self, msg, max_new_tokens=5, choices: list = []):
        """
        Handles similarity-scoring questions where answers might have multiple tokens.
        """
        # Ensure tokenizer has proper special tokens set
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        chat = [
            {"role": "user",
             "content": "You are a sentence similarity scoring chatbot. Only respond with one of the given scores: 0, 1, 2, 3, 4, or 5."},
            {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
            {"role": "user", "content": f"{msg}"},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        print(formatted_chat)
        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs.input_ids.to(self.model.device)
        attention_mask = inputs.attention_mask.to(self.model.device)

        # Encode the allowed score tokens "0".."5"
        letters = ["0", "1", "2", "3", "4", "5"]
        encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
        flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist]  # Flatten the list
        print(flattened_encoded_choices)

        allowed_tokens = flattened_encoded_choices
        allowed_tokens += self.get_chat_template_tokens()  # Get the special chat tokens
        allowed_token_ids = set(allowed_tokens)  # Ensure uniqueness

        # Custom LogitsProcessor to restrict generation
        class RestrictToABCDLogitsProcessor(LogitsProcessor):
            def __call__(self, input_ids, scores):
                mask = torch.full_like(scores, float("-inf"))  # Block all tokens
                mask[:, list(allowed_token_ids)] = scores[:, list(allowed_token_ids)]  # Allow only the score tokens (0-5) and chat special tokens
                return mask

        logits_processor = LogitsProcessorList([RestrictToABCDLogitsProcessor()])

        # Generate response
        output = self.model.generate(
            input_ids,
            do_sample=True,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            temperature=0.4,
            logits_processor=logits_processor,
        )
        generated_ids = output[0]  # The generated sequence including the prompt
        generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
        return generated_text

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = {'correct': 0, 'total': 0}

        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            answer = row["score"]
            choices = ["0", "1", "2", "3", "4", "5"]

            # Prints for debugging
            print(f"Answer: {answer}")
            print("Type of answer:", type(answer))

            # Construct the prompt/message
            instruction = f"Aşağıda verilen iki cümlenin birbirlerine olan anlamsal benzerliğini 0'dan 5'e kadar olan bir tam sayıyla söyleyin."
            prompt = f"""{instruction}\nCümle 1: {row["sentence_1"]}\nCümle 2: {row["sentence_2"]}\nSadece tek bir tam sayı söyleyin, ek bir kelime ya da sembol kullanmayın."""
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_sts_multi_token(message, max_new_tokens=2)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')

            # Print answers
            print(f"Correct Answer: {answer}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")
            print(f"Result: {answer == model_answer_cleaned}")

            # Check if correct based on metric
            if answer == model_answer_cleaned:
                true += 1
                difficulty_results['correct'] += 1

            difficulty_results['total'] += 1

        # Print results
        stats = difficulty_results
        calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
        print(f"Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
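The constrained decoding in `STSTask` relies on a custom `LogitsProcessor` that masks every logit except the allowed score tokens. A minimal standalone sketch of that masking step on a dummy logits tensor (the token ids are made up; no model is needed):

import torch
from transformers import LogitsProcessor

class RestrictToAllowedTokens(LogitsProcessor):
    def __init__(self, allowed_token_ids):
        self.allowed = list(allowed_token_ids)

    def __call__(self, input_ids, scores):
        mask = torch.full_like(scores, float("-inf"))    # block everything
        mask[:, self.allowed] = scores[:, self.allowed]  # re-enable allowed ids
        return mask

# Dummy batch of logits over a 10-token vocabulary; ids 2, 5, 7 are "allowed"
scores = torch.randn(1, 10)
processor = RestrictToAllowedTokens({2, 5, 7})
masked = processor(input_ids=None, scores=scores)
assert torch.isinf(masked[0, 0]) and not torch.isinf(masked[0, 5])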
src/deepeval/topic_detection.py
ADDED
@@ -0,0 +1,79 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import ast


class TopicDetectionTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/topic_detection_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(10, len(dataset))))

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            choices = ast.literal_eval(row["choices"])  # Convert string to list
            formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
            category = row["level"].lower().replace(' ', '')
            answer = row["answer"]
            text = row["text"]

            # Prints for debugging
            print(f"Choices: {choices}")
            print("Type of choices:", type(choices))
            print("Type of answer:", type(answer))

            # Get answer index (starting from 0)
            if type(answer) == int:
                answer_index = answer
            else:
                answer_index = int(answer)
            correct_answer_letter = chr(65 + answer_index)

            # Construct the prompt/message
            instruction = "Aşağıdaki metni analiz et ve seçeneklerden bu metnin en olası kategorisini belirle. Temaya ve detaylara dikkat ederek metnin ana fikrini göz önünde bulundurarak soruyu cevapla."
            prompt = f"{instruction}\n\nMetin:\n{text}\nSeçenekler:\n{formatted_choices}\n\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')

            # Print answers
            print(f"Correct Answer: {correct_answer_letter}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")
            print(f"Result: {correct_answer_letter == model_answer_cleaned}")

            # Check if correct based on metric
            if correct_answer_letter == model_answer_cleaned:
                true += 1
                difficulty_results[category]['correct'] += 1

            difficulty_results[category]['total'] += 1

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
src/deepeval/turkish_vocabulary.py
ADDED
@@ -0,0 +1,100 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import os
import ast
import re
from datasets import load_dataset, get_dataset_split_names
HF_TOKEN = os.getenv("HF_TOKEN")

class TurkishVocabularyTask(BaseTask):
    def __init__(self, model_name):
        self.subsets = ["rare", "loan"]
        super().__init__("metunlp/turkish_vocabulary", model_name=model_name)

    def load_dataset_from_hf(self):
        evaluate_count = 1
        print("Loading dataset from Hugging Face.")
        dataset_dict = {}
        for subset in self.subsets:
            subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
            dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
        print("Dataset loaded.")
        return dataset_dict

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: defaultdict(lambda: {'correct': 0, 'total': 0}))

        total_count = 0
        true = 0

        for subset in self.subsets:
            curr_dataset = self.dataset[subset]
            print(curr_dataset[0])

            # Determine the question based on the subset
            if subset == "rare":
                question = "Verilen kelimenin eş anlamlısı aşağıdakilerden hangisidir?"
            elif subset == "loan":
                question = "Verilen kelimenin Türkçe kökenli eş anlamlısı aşağıdakilerden hangisidir?"
            else:
                question = "Verilen kelimenin eş anlamlısı aşağıdakilerden hangisidir?"

            for row in curr_dataset:
                total_count += 1

                # Get values from row
                category = "hard" if row["level"] == 1 else "easy" if row["level"] == 0 else None
                answer_index = row["answer"]
                correct_answer_letter = chr(65 + answer_index)
                word = row["word"]
                choices = ast.literal_eval(row["choices"])  # Convert string to list
                formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])

                # Prints for debugging
                print(f"Difficulty: {category}")
                print("Type of difficulty:", type(category))
                print(f"Answer: {correct_answer_letter}")
                print("Type of answer:", type(answer_index))

                # Construct the prompt/message
                instruction = ""
                prompt = f"Soru: {question}\nKelime: {word}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
                message = prompt

                # Get/format answer of the model
                model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
                responses.append(model_answer)
                model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')

                # Print answers
                print(f"Correct Answer: {correct_answer_letter}")
                print(f"Model Answer: {model_answer}")
                print(f"Model Answer Cleaned: {model_answer_cleaned}")
                print(f"Result: {correct_answer_letter == model_answer_cleaned}")

                # Check if correct based on metric
                if correct_answer_letter == model_answer_cleaned:
                    true += 1
                    difficulty_results[subset][category]['correct'] += 1

                difficulty_results[subset][category]['total'] += 1

        # Print results categorized by difficulty
        for subset in self.subsets:
            subset_results = difficulty_results[subset]
            for category, stats in subset_results.items():
                calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
                print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
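Assuming `BaseTask` (already present in the repo, not part of this upload) wires up the model, tokenizer, and dataset in its constructor, any of these tasks could presumably be exercised as in the hypothetical snippet below; the model id is only a placeholder.

# Hypothetical usage sketch; BaseTask's exact constructor behavior is assumed.
from src.deepeval.mmlu import MMLUTask

task = MMLUTask(model_name="your-model-id")  # placeholder model id
results = task.evaluate()
print(results["acc"], results["acc_stderr"])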