aacengiz committed
Commit ca54ffd · verified · 1 Parent(s): b5edba5

Upload 9 files

src/deepeval/bias.py ADDED
@@ -0,0 +1,98 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import ast
+
+
+ class BiasTask(BaseTask):
+     def __init__(self, model_name):
+         super().__init__("metunlp/sosyoloji_bias", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         dataset = super().load_dataset_from_hf()
+         return dataset.select(range(min(1, len(dataset))))
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+         total_count = 0
+         true = 0
+
+         for row in self.dataset:
+             total_count += 2  # each row contributes an ambiguous and a disambiguated question
+
+             # Get values from row
+
+             ## common
+             choices = ast.literal_eval(row["choices"])  # Convert string to list
+             formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
+
+             ## ambiguous context
+             ambiguous_context = row["ambiguous_context"]
+             ambiguous_question = row["question_ambiguous"]
+             ambiguous_answer = row["answer_ambiguous"]
+             ambiguous_correct_answer_letter = chr(64 + ambiguous_answer)  # 65 - 1: answer indices start from 1, so 1 maps to 'A'
+             ambiguous_prompt = f"Bağlam: {ambiguous_context}\nSoru: {ambiguous_question}\nSeçenekler:{formatted_choices}"
+
+             ## disambiguated context
+             disambiguated_context = row["disambiguated_context"]
+             disambiguated_question = row["question_disambiguated"]
+             disambiguated_answer = row["answer_disambiguated"]
+             disambiguated_correct_answer_letter = chr(64 + disambiguated_answer)  # 65 - 1: answer indices start from 1, so 1 maps to 'A'
+             disambiguated_prompt = f"Bağlam: {disambiguated_context}\nSoru: {disambiguated_question}\nSeçenekler:{formatted_choices}"
+
+             # Prints for debugging
+             print(f"Choices: {choices}")
+             print("Type of choices:", type(choices))
+
+             # STAGE 1
+             instruction = ""
+             prompt = f"Stage1:\n{ambiguous_prompt}\n{instruction}\n"
+             message = prompt
+             ambiguous_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+             ambiguous_model_answer_cleaned = ambiguous_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+             ## Print answers
+             print(f"Correct Answer: {ambiguous_correct_answer_letter}")
+             print(f"Model Answer: {ambiguous_model_answer}")
+             print(f"Model Answer Cleaned: {ambiguous_model_answer_cleaned}")
+             print(f"Result: {ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned}")
+             ## Check if correct based on metric
+             if ambiguous_correct_answer_letter == ambiguous_model_answer_cleaned:
+                 true += 1
+                 difficulty_results["ambiguous"]['correct'] += 1
+
+             difficulty_results["ambiguous"]['total'] += 1
+
+             # STAGE 2
+             instruction = ""
+             prompt = f"Stage2:\n{disambiguated_prompt}\n{instruction}\n"
+             message = prompt
+             disambiguated_model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+             disambiguated_model_answer_cleaned = disambiguated_model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+             ## Print answers
+             print(f"Correct Answer: {disambiguated_correct_answer_letter}")
+             print(f"Model Answer: {disambiguated_model_answer}")
+             print(f"Model Answer Cleaned: {disambiguated_model_answer_cleaned}")
+             print(f"Result: {disambiguated_correct_answer_letter == disambiguated_model_answer_cleaned}")
+             responses.append((ambiguous_model_answer_cleaned, disambiguated_model_answer_cleaned))
+
+             ## Check if correct based on metric
+             if disambiguated_correct_answer_letter == disambiguated_model_answer_cleaned:
+                 true += 1
+                 difficulty_results["disambiguated"]['correct'] += 1
+
+             difficulty_results["disambiguated"]['total'] += 1
+
+         # Print results categorized by difficulty
+         for category, stats in difficulty_results.items():
+             calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+             print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
+
src/deepeval/math.py ADDED
@@ -0,0 +1,128 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import re
+
+ class MathTask(BaseTask):
+     def __init__(self, model_name):
+         super().__init__("metunlp/math_tr", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         dataset = super().load_dataset_from_hf()
+         return dataset.select(range(min(1, len(dataset))))
+
+     def generate_response_oeqa_multi_token(self, msg, max_new_tokens: int = 128):
+         """
+         Handles open-ended questions where answers might span multiple tokens.
+         """
+         # Ensure tokenizer has proper special tokens set
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         if self.model.config.pad_token_id is None:
+             self.model.config.pad_token_id = self.tokenizer.pad_token_id
+
+         chat = [
+             {"role": "user", "content": "You are a question-answering chatbot."},
+             {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
+             {"role": "user", "content": f"{msg}"},
+         ]
+         formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+         print(formatted_chat)
+
+         inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
+         input_ids = inputs.input_ids.to(self.model.device)
+         attention_mask = inputs.attention_mask.to(self.model.device)
+
+         # Generate response with proper token limits
+         output = self.model.generate(
+             input_ids,
+             do_sample=True,
+             attention_mask=attention_mask,
+             eos_token_id=self.tokenizer.eos_token_id,
+             pad_token_id=self.tokenizer.pad_token_id,
+             temperature=0.4,
+             max_new_tokens=max_new_tokens,
+         )
+
+         generated_ids = output[0]  # The generated sequence including the prompt
+         generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
+         generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+         return generated_text
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+         total_count = 0
+         true = 0
+
+         for row in self.dataset:
+             total_count += 1
+
+             # Get values from row
+             category = str(row["difficulty"])
+             answer = row["final_answer"]
+
+             # Prints for debugging
+             print(f"Answer: {answer}")
+             print("Type of answer:", type(answer))
+
+             # Construct the prompt/message
+             instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.
+
+ Nihai Cevap için Uyulması Gereken Format Kuralları:
+
+ 1. Kesirler her zaman en sade hallerinde verilmeli.
+    - Matris içi kesirler: x/y biçiminde.
+    - Diğer tüm kesirler: \\frac{{x}}{{y}} biçiminde.
+ 2. Çarpma işareti (*) kullanılmamalı. Örnek: 2x yazın, 2*x değil.
+ 3. Birden çok değişken varsa alfabetik sıraya uyulmalı (x, y, z...), polinomlar azalan derece sırasına göre yazılmalı.
+ 4. Her zaman aynı gösterim biçimi kullanılmalı. Ondalık yerine kesir kullanılmalı (ör. 0.5 yerine \\frac{{1}}{{2}} ).
+ 5. Faktörize polinomlar daima aynı faktör sırası ile verilsin; her sorguda aynı cevabı verecek şekilde tutarlılığı koruyun.
+ 6. Nihai cevabı kutu dışında tekrar etmeyin, biçimi değiştirmeyin. Aynı soru tekrarlandığında aynı formatı ve cevabı verin.
+ 7. Nihai cevap, tek seferde \\boxed{{...}} içinde verilmeli. Örnek: Cevap x ise, "\\boxed{{x}}".
+
+ Görev: Problemi çözün, son adımda yukarıdaki kurallara tam uyan tek bir kutu içinde nihai cevabı verin.
+
+ Çözüm:
+
+ Nihai cevap:
+ """
+             prompt = f"{instruction}\n\nSoru:\n{row['question']}\n"
+             message = prompt
+
+             # Get/format answer of the model
+             model_answer = self.generate_response_oeqa_multi_token(message)
+             responses.append(model_answer)
+             boxed_match = re.search(r"\\boxed{([^}]*)}", model_answer)
+             model_answer_cleaned = boxed_match.group(1).strip() if boxed_match else ""  # extract the content of \boxed{...}
+
+             # Print answers
+             print(f"Correct Answer: {answer}")
+             print(f"Model Answer: {model_answer}")
+             print(f"Model Answer Cleaned: {model_answer_cleaned}")
+             print(f"Result: {answer == model_answer_cleaned}")
+
+             # Check if correct based on metric
+             if answer == model_answer_cleaned:
+                 true += 1
+                 difficulty_results[category]['correct'] += 1
+
+             difficulty_results[category]['total'] += 1
+
+         # Print results categorized by difficulty
+         for category, stats in difficulty_results.items():
+             calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+             print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
+
src/deepeval/metaphors_and_idioms.py ADDED
@@ -0,0 +1,87 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import os
+ import ast
+ import re
+ from datasets import load_dataset, get_dataset_split_names
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ class MetaphorsAndIdiomsTask(BaseTask):
+     def __init__(self, model_name):
+         super().__init__("metunlp/metaphors_and_idioms", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         dataset = super().load_dataset_from_hf()
+         return dataset  # dataset.select(range(min(10, len(dataset))))
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: defaultdict(lambda: {'correct': 0, 'total': 0}))
+
+         total_count = 0
+         true = 0
+
+         for row in self.dataset:
+             total_count += 1
+
+             # Get values from row
+             category = "hard" if row["level"] == 1 else "easy" if row["level"] == 0 else None
+             answer_index = row["answer"]
+             correct_answer_letter = chr(65 + answer_index)
+             context = row["context"]
+             choices = ast.literal_eval(row["choices"])  # Convert string to list
+             formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
+             subset = row["idiom_type"]
+
+             if subset == "atasözü":
+                 question = "Aşağıda verilen durum hangi atasözü ile en iyi ifade edilebilir?"
+             elif subset == "deyim":
+                 question = """Verilen bağlamda "[MASKED]" ile boş bırakılan yere hangi deyim getirilirse cümlenin akışı anlamlı olur?"""
+             else:
+                 question = "Aşağıda verilen durum hangi atasözü ile en iyi ifade edilebilir?"
+
+             # Prints for debugging
+             print(f"Difficulty: {category}")
+             print("Type of difficulty:", type(category))
+             print(f"Answer: {correct_answer_letter}")
+             print("Type of answer:", type(answer_index))
+
+             # Construct the prompt/message
+             instruction = ""
+             prompt = f"Soru: {question}\nBağlam: {context}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+             message = prompt
+
+             # Get/format answer of the model
+             model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+             responses.append(model_answer)
+             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+
+             # Print answers
+             print(f"Correct Answer: {correct_answer_letter}")
+             print(f"Model Answer: {model_answer}")
+             print(f"Model Answer Cleaned: {model_answer_cleaned}")
+             print(f"Result: {correct_answer_letter == model_answer_cleaned}")
+
+             # Check if correct based on metric
+             if correct_answer_letter == model_answer_cleaned:
+                 true += 1
+                 difficulty_results[subset][category]['correct'] += 1
+
+             difficulty_results[subset][category]['total'] += 1
+
+         # Print results categorized by difficulty
+         for subset in difficulty_results.keys():
+             subset_results = difficulty_results[subset]
+             for category, stats in subset_results.items():
+                 calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+                 print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
+
src/deepeval/mmlu.py ADDED
@@ -0,0 +1,87 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import os
+ import ast
+ import re
+ from datasets import load_dataset, get_dataset_config_names
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ class MMLUTask(BaseTask):
+     def __init__(self, model_name):
+         self.subsets = get_dataset_config_names("metunlp/mmlu_tr")
+         print(self.subsets)
+         super().__init__("metunlp/mmlu_tr", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         evaluate_count = 1
+         print("Loading dataset from Hugging Face.")
+         dataset_dict = {}
+         for subset in self.subsets:
+             subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
+             dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
+         print("Dataset loaded.")
+         return dataset_dict
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+
+         total_count = 0
+         true = 0
+
+         for subset in self.subsets:
+             curr_dataset = self.dataset[subset]
+             print(curr_dataset[0])
+
+             for row in curr_dataset:
+                 total_count += 1
+
+                 # Get values from row
+                 question = row["question"]
+                 answer_index = row["answer"]
+                 correct_answer_letter = chr(65 + answer_index)
+                 choices = ast.literal_eval(row["choices"])  # Convert string to list
+                 formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
+
+                 # Prints for debugging
+                 print(f"Answer: {correct_answer_letter}")
+                 print("Type of answer:", type(answer_index))
+
+                 # Construct the prompt/message
+                 instruction = f"Aşağıda {row['subject']} konusunda çoktan seçmeli bir soru verilmiştir."
+                 prompt = f"{instruction}\n\nSoru: {question}\nSeçenekler:\n{formatted_choices}\n\n"
+                 message = prompt
+
+                 # Get/format answer of the model
+                 model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+                 responses.append(model_answer)
+                 model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+
+                 # Print answers
+                 print(f"Correct Answer: {correct_answer_letter}")
+                 print(f"Model Answer: {model_answer}")
+                 print(f"Model Answer Cleaned: {model_answer_cleaned}")
+                 print(f"Result: {correct_answer_letter == model_answer_cleaned}")
+
+                 # Check if correct based on metric
+                 if correct_answer_letter == model_answer_cleaned:
+                     true += 1
+                     difficulty_results[subset]['correct'] += 1
+
+                 difficulty_results[subset]['total'] += 1
+
+         # Print results categorized by subset
+         for category, stats in difficulty_results.items():
+             calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+             print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
+
src/deepeval/ner.py ADDED
@@ -0,0 +1,166 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import re
+
+ class NERTask(BaseTask):
+     def __init__(self, model_name):
+         super().__init__("metunlp/tr_ner", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         dataset = super().load_dataset_from_hf()
+         return dataset.select(range(min(1, len(dataset))))
+
+     def generate_response_oeqa_multi_token(self, msg, max_new_tokens: int = 128):
+         """
+         Handles open-ended questions where answers might span multiple tokens.
+         """
+         # Ensure tokenizer has proper special tokens set
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         if self.model.config.pad_token_id is None:
+             self.model.config.pad_token_id = self.tokenizer.pad_token_id
+
+         chat = [
+             {"role": "user", "content": "You are a question-answering chatbot."},
+             {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
+             {"role": "user", "content": f"{msg}"},
+         ]
+         formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+         print(formatted_chat)
+
+         inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
+         input_ids = inputs.input_ids.to(self.model.device)
+         attention_mask = inputs.attention_mask.to(self.model.device)
+
+         # Generate response with proper token limits
+         output = self.model.generate(
+             input_ids,
+             do_sample=True,
+             attention_mask=attention_mask,
+             eos_token_id=self.tokenizer.eos_token_id,
+             pad_token_id=self.tokenizer.pad_token_id,
+             temperature=0.4,
+             max_new_tokens=max_new_tokens,
+         )
+
+         generated_ids = output[0]  # The generated sequence including the prompt
+         generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
+         generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+         return generated_text
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+         total_count = 0
+         true = 0
+
+         for row in self.dataset:
+             total_count += 1
+
+             # Get values from row
+             category = str(row["difficulty"])
+             answer = row["final_answer"]
+
+             # Prints for debugging
+             print(f"Answer: {answer}")
+             print("Type of answer:", type(answer))
+
+             # Construct the prompt/message
+             instruction = ("Aşağıdaki Named Entity Recognition (NER) için etiketlenmesi gereken cümleler vardır. "
+                            "Cümlelerdeki varlıkları belirleyin ve şu kategorilere ayırın: CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PER, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, TITLE, WORK_OF_ART. "
+                            ""
+                            "Varlıklar, anlamlı bilgiler içeren terimlerdir ve aşağıdaki şekilde tanımlanır: "
+                            "CARDINAL: Nicelik veya sıralama belirtmeyen sayısal ifadeler."
+                            "DATE: Belirli bir tarih veya zaman ifadeleri."
+                            "EVENT: Adlandırılmış olaylar veya durumlar."
+                            "FAC: Binalar veya önemli yerler gibi tesisler."
+                            "GPE: Ülke, şehir veya eyalet gibi coğrafi-politik varlıklar."
+                            "LANGUAGE: Adlandırılmış diller."
+                            "LAW: Yasal belgeler, düzenlemeler veya kanunlar."
+                            "LOC: Coğrafi veya fiziksel konumlar (GPE dışındaki)."
+                            "MONEY: Parasal değerler."
+                            "NORP: Milletler, dini veya siyasi gruplar."
+                            "ORDINAL: Sıralama veya dereceler."
+                            "ORG: Organizasyonlar veya kurumlar."
+                            "PER: Kişisel unvanlar veya sıfatlar."
+                            "PERSON: Bireylerin isimleri."
+                            "PRODUCT: Üretilen nesneler veya araçlar."
+                            "QUANTITY: Ölçülebilir miktarlar ve birimler."
+                            "TIME: Günün belirli saatleri."
+                            "TITLE: Kişi unvanları."
+                            "WORK_OF_ART: Sanat eserleri, kitaplar, müzik vb. Adlar, tarih ifadeleri, konumlar gibi belirgin bilgiler varlıktır."
+                            ""
+                            "Fiiller, sıfatlar, zarflar, soyut kavramlar gibi ifadeler varlık değildir. Çıktıyı aşağıdaki JSON formatında döndürün. "
+                            ""
+                            "Örnekler: "
+                            "Girdi: "
+                            "sentence: \"Üç yıl aradan sonra gerçekleştirilen ve Karadeniz, Ege ve Akdeniz’de düzenlenecek olan tatbikata ilişkin Yunanistan'ın Kathimerini gazetesi 'Türk-Yunan: Çetin donanma dengesinin gücü' başlığını kullandı.\""
+                            "Çıktı: "
+                            "Üç yıl,DATE"
+                            "Karadeniz,LOC"
+                            "Ege,LOC"
+                            "Akdeniz,LOC"
+                            "Yunanistan,GPE"
+                            "Kathimerini,ORG"
+                            "Türk,NORP"
+                            ""
+                            "Girdi:"
+                            "sentence: \"Evlendikten sonra oyunculuğu bırakan Makal, geçen yıl eşi ve oğluyla beraber İstanbul’dan Göcek’e taşınmıştı.\""
+                            "Çıktı: "
+                            "Makal,PERSON"
+                            "İstanbul,GPE"
+                            "Göcek,GPE"
+                            ""
+                            "Girdi:"
+                            "sentence: \"Yeşil-kırmızılılardan 2016’da ayrılıp 3 sezonluk aradan sonra 2019’da geri dönen Sarıca, takımına 2021 yılında Şampiyonlar Ligi’nde, 2023’te de Süper Lig’de iki final oynattı.\""
+                            "Çıktı:"
+                            "2016’da,DATE"
+                            "3,CARDINAL"
+                            "2019’da,DATE"
+                            "Sarıca,PERSON"
+                            "2021,DATE"
+                            "Şampiyonlar Ligi’nde,EVENT"
+                            "2023’te,DATE"
+                            "Süper Lig’de,EVENT"
+                            "iki,CARDINAL"
+                            ""
+                            "Verilen cümlelerdeki her varlığı csv formatında yukarıdaki örneklere benzer şekilde belirleyin. Çıktıdaki her satırı aşağıdaki gibi oluşturun: "
+                            "<Varlık metni>,<Varlık etiketi>")  # no trailing comma here, otherwise instruction becomes a tuple
+             prompt = f"{instruction}\n\nSoru:\n{row['question']}\n"
+             message = prompt
+
+             # Get/format answer of the model
+             model_answer = self.generate_response_oeqa_multi_token(message)
+             responses.append(model_answer)
+             model_answer_cleaned = model_answer
+
+             # Print answers
+             print(f"Correct Answer: {answer}")
+             print(f"Model Answer: {model_answer}")
+             print(f"Model Answer Cleaned: {model_answer_cleaned}")
+             print(f"Result: {answer == model_answer_cleaned}")
+
+             # Check if correct based on metric
+             if answer == model_answer_cleaned:
+                 true += 1
+                 difficulty_results[category]['correct'] += 1
+
+             difficulty_results[category]['total'] += 1
+
+         # Print results categorized by difficulty
+         for category, stats in difficulty_results.items():
+             calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+             print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
+
src/deepeval/pos.py ADDED
@@ -0,0 +1,159 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import re
+
+ class POSTask(BaseTask):
+     def __init__(self, model_name):
+         super().__init__("metunlp/tr_pos", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         dataset = super().load_dataset_from_hf()
+         return dataset.select(range(min(1, len(dataset))))
+
+     def generate_response_oeqa_multi_token(self, msg, max_new_tokens: int = 128):
+         """
+         Handles open-ended questions where answers might span multiple tokens.
+         """
+         # Ensure tokenizer has proper special tokens set
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         if self.model.config.pad_token_id is None:
+             self.model.config.pad_token_id = self.tokenizer.pad_token_id
+
+         chat = [
+             {"role": "user", "content": "You are a question-answering chatbot."},
+             {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
+             {"role": "user", "content": f"{msg}"},
+         ]
+         formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+         print(formatted_chat)
+
+         inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
+         input_ids = inputs.input_ids.to(self.model.device)
+         attention_mask = inputs.attention_mask.to(self.model.device)
+         # NOTE: the prompt below is assigned but never used; generation works on the chat-formatted `msg` above.
+         prompt = ("Aşağıdaki Named Entity Recognition (NER) için etiketlenmesi gereken cümleler vardır. "
+                   "Cümlelerdeki varlıkları belirleyin ve şu kategorilere ayırın: CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PER, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, TITLE, WORK_OF_ART. "
+                   ""
+                   "Varlıklar, anlamlı bilgiler içeren terimlerdir ve aşağıdaki şekilde tanımlanır: "
+                   "CARDINAL: Nicelik veya sıralama belirtmeyen sayısal ifadeler."
+                   "DATE: Belirli bir tarih veya zaman ifadeleri."
+                   "EVENT: Adlandırılmış olaylar veya durumlar."
+                   "FAC: Binalar veya önemli yerler gibi tesisler."
+                   "GPE: Ülke, şehir veya eyalet gibi coğrafi-politik varlıklar."
+                   "LANGUAGE: Adlandırılmış diller."
+                   "LAW: Yasal belgeler, düzenlemeler veya kanunlar."
+                   "LOC: Coğrafi veya fiziksel konumlar (GPE dışındaki)."
+                   "MONEY: Parasal değerler."
+                   "NORP: Milletler, dini veya siyasi gruplar."
+                   "ORDINAL: Sıralama veya dereceler."
+                   "ORG: Organizasyonlar veya kurumlar."
+                   "PER: Kişisel unvanlar veya sıfatlar."
+                   "PERSON: Bireylerin isimleri."
+                   "PRODUCT: Üretilen nesneler veya araçlar."
+                   "QUANTITY: Ölçülebilir miktarlar ve birimler."
+                   "TIME: Günün belirli saatleri."
+                   "TITLE: Kişi unvanları."
+                   "WORK_OF_ART: Sanat eserleri, kitaplar, müzik vb. Adlar, tarih ifadeleri, konumlar gibi belirgin bilgiler varlıktır."
+                   ""
+                   "Fiiller, sıfatlar, zarflar, soyut kavramlar gibi ifadeler varlık değildir. Çıktıyı aşağıdaki JSON formatında döndürün. "
+                   ""
+                   "Örnekler: "
+                   "Girdi: "
+                   "\"sentence\": \"Üç yıl aradan sonra gerçekleştirilen ve Karadeniz, Ege ve Akdeniz’de düzenlenecek olan tatbikata ilişkin Yunanistan'ın Kathimerini gazetesi 'Türk-Yunan: Çetin donanma dengesinin gücü' başlığını kullandı.\""
+                   "Çıktı: "
+                   "Üç yıl: DATE\" }, { \"text\": \"Karadeniz\", \"label\": \"LOC\" }, { \"text\": \"Ege\", \"label\": \"LOC\" }, { \"text\": \"Akdeniz\", \"label\": \"LOC\" }, { \"text\": \"Yunanistan\", \"label\": \"GPE\" }, { \"text\": \"Kathimerini\", \"label\": \"ORG\" }, { \"text\": \"Türk\", \"label\": \"NORP\" }]} Girdi: {\"sentence\": \"Evlendikten sonra oyunculuğu bırakan Makal, geçen yıl eşi ve oğluyla beraber İstanbul’dan Göcek’e taşınmıştı.\"} Çıktı: {\"entities\": [{ \"text\": \"Makal\", \"label\": \"PERSON\" }, { \"text\": \"İstanbul\", \"label\": \"GPE\" }, { \"text\": \"Göcek\", \"label\": \"GPE\" }]} Girdi: {\"sentence\": \"Yeşil-kırmızılılardan 2016’da ayrılıp 3 sezonluk aradan sonra 2019’da geri dönen Sarıca, takımına 2021 yılında Şampiyonlar Ligi’nde, 2023’te de Süper Lig’de iki final oynattı.\"} Çıktı: {\"entities\": [{ \"text\": \"2016’da\", \"label\": \"DATE\" }, { \"text\": \"3\", \"label\": \"CARDINAL\" }, { \"text\": \"2019’da\", \"label\": \"DATE\" }, { \"text\": \"Sarıca\", \"label\": \"PERSON\" }, { \"text\": \"2021\", \"label\": \"DATE\" }, { \"text\": \"Şampiyonlar Ligi’nde\", \"label\": \"EVENT\" }, { \"text\": \"2023’te\", \"label\": \"DATE\" }, { \"text\": \"Süper Lig’de\", \"label\": \"EVENT\" }, { \"text\": \"iki\", \"label\": \"CARDINAL\" }]}. Verilen cümlelerdeki varlıkları JSON formatında yukarıdaki örneklere benzer şekilde belirleyin. Çıktıyı aşağıdaki gibi oluşturun: Girdi Formatı: {\"sentence\": \"<CÜMLE>\"} Çıktı Formatı: {\"entities\": [{ \"text\": \"<Varlık metni>\", \"label\": \"<Varlık etiketi>\" }]}")
+
+         # Generate response with proper token limits
+         output = self.model.generate(
+             input_ids,
+             do_sample=True,
+             attention_mask=attention_mask,
+             eos_token_id=self.tokenizer.eos_token_id,
+             pad_token_id=self.tokenizer.pad_token_id,
+             temperature=0.4,
+             max_new_tokens=max_new_tokens,
+         )
+
+         generated_ids = output[0]  # The generated sequence including the prompt
+         generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
+         generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+         return generated_text
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+         total_count = 0
+         true = 0
+
+         for row in self.dataset:
+             total_count += 1
+
+             # Get values from row
+             category = str(row["difficulty"])
+             answer = row["final_answer"]
+
+             # Prints for debugging
+             print(f"Answer: {answer}")
+             print("Type of answer:", type(answer))
+
+             # Construct the prompt/message
+             instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu (\\boxed{{}}) içinde verin.
+
+ Nihai Cevap için Uyulması Gereken Format Kuralları:
+
+ 1. Kesirler her zaman en sade hallerinde verilmeli.
+    - Matris içi kesirler: x/y biçiminde.
+    - Diğer tüm kesirler: \\frac{{x}}{{y}} biçiminde.
+ 2. Çarpma işareti (*) kullanılmamalı. Örnek: 2x yazın, 2*x değil.
+ 3. Birden çok değişken varsa alfabetik sıraya uyulmalı (x, y, z...), polinomlar azalan derece sırasına göre yazılmalı.
+ 4. Her zaman aynı gösterim biçimi kullanılmalı. Ondalık yerine kesir kullanılmalı (ör. 0.5 yerine \\frac{{1}}{{2}} ).
+ 5. Faktörize polinomlar daima aynı faktör sırası ile verilsin; her sorguda aynı cevabı verecek şekilde tutarlılığı koruyun.
+ 6. Nihai cevabı kutu dışında tekrar etmeyin, biçimi değiştirmeyin. Aynı soru tekrarlandığında aynı formatı ve cevabı verin.
+ 7. Nihai cevap, tek seferde \\boxed{{...}} içinde verilmeli. Örnek: Cevap x ise, "\\boxed{{x}}".
+
+ Görev: Problemi çözün, son adımda yukarıdaki kurallara tam uyan tek bir kutu içinde nihai cevabı verin.
+
+ Çözüm:
+
+ Nihai cevap:
+ """
+             prompt = f"{instruction}\n\nSoru:\n{row['question']}\n"
+             message = prompt
+
+             # Get/format answer of the model
+             model_answer = self.generate_response_oeqa_multi_token(message)
+             responses.append(model_answer)
+             boxed_match = re.search(r"\\boxed{([^}]*)}", model_answer)
+             model_answer_cleaned = boxed_match.group(1).strip() if boxed_match else ""  # extract the content of \boxed{...}
+
+             # Print answers
+             print(f"Correct Answer: {answer}")
+             print(f"Model Answer: {model_answer}")
+             print(f"Model Answer Cleaned: {model_answer_cleaned}")
+             print(f"Result: {answer == model_answer_cleaned}")
+
+             # Check if correct based on metric
+             if answer == model_answer_cleaned:
+                 true += 1
+                 difficulty_results[category]['correct'] += 1
+
+             difficulty_results[category]['total'] += 1
+
+         # Print results categorized by difficulty
+         for category, stats in difficulty_results.items():
+             calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+             print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
+
src/deepeval/sts.py ADDED
@@ -0,0 +1,131 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import re
+ from datasets import load_dataset
+ import os
+ from dotenv import load_dotenv
+ import openai
+ from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList, LogitsProcessor
+ import torch
+ from typing import List
+
+ class STSTask(BaseTask):
+     def __init__(self, model_name):
+         super().__init__("metunlp/sts_tr", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         dataset = super().load_dataset_from_hf()
+         return dataset.select(range(min(1, len(dataset))))
+
+     def generate_response_sts_multi_token(self, msg, max_new_tokens=5, choices: list = []):
+         """
+         Handles similarity-scoring questions where answers might have multiple tokens.
+         """
+         # Ensure tokenizer has proper special tokens set
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         if self.model.config.pad_token_id is None:
+             self.model.config.pad_token_id = self.tokenizer.pad_token_id
+
+         chat = [
+             {"role": "user",
+              "content": "You are a sentence similarity scoring chatbot. Only respond with one of the given scores: 0, 1, 2, 3, 4, or 5."},
+             {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
+             {"role": "user", "content": f"{msg}"},
+         ]
+         formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+         print(formatted_chat)
+         inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
+         input_ids = inputs.input_ids.to(self.model.device)
+         attention_mask = inputs.attention_mask.to(self.model.device)
+
+         # The allowed answers are the score strings "0" through "5"
+         letters = ["0", "1", "2", "3", "4", "5"]
+         encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
+         flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist]  # Flatten the list
+         print(flattened_encoded_choices)
+
+         allowed_tokens = flattened_encoded_choices
+         allowed_tokens += self.get_chat_template_tokens()  # Get the special chat tokens
+         allowed_token_ids = set(allowed_tokens)  # Ensure uniqueness
+
+         # Custom LogitsProcessor to restrict generation
+         class RestrictToABCDLogitsProcessor(LogitsProcessor):
+             def __call__(self, input_ids, scores):
+                 mask = torch.full_like(scores, float("-inf"))  # Block all tokens
+                 mask[:, list(allowed_token_ids)] = scores[:, list(allowed_token_ids)]  # Allow only the score tokens (0-5)
+                 return mask
+
+         logits_processor = LogitsProcessorList([RestrictToABCDLogitsProcessor()])
+
+         # Generate response
+         output = self.model.generate(
+             input_ids,
+             do_sample=True,
+             attention_mask=attention_mask,
+             max_new_tokens=max_new_tokens,
+             eos_token_id=self.tokenizer.eos_token_id,
+             pad_token_id=self.tokenizer.pad_token_id,
+             temperature=0.4,
+             logits_processor=logits_processor,
+         )
+         generated_ids = output[0]  # The generated sequence including the prompt
+         generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
+         generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+         return generated_text
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = {'correct': 0, 'total': 0}
+
+         total_count = 0
+         true = 0
+
+         for row in self.dataset:
+             total_count += 1
+
+             # Get values from row
+             answer = row["score"]
+             choices = ["0", "1", "2", "3", "4", "5"]
+
+             # Prints for debugging
+             print(f"Answer: {answer}")
+             print("Type of answer:", type(answer))
+
+             # Construct the prompt/message
+             instruction = f"Aşağıda verilen iki cümlenin birbirlerine olan anlamsal benzerliğini 0'dan 5'e kadar olan bir tam sayıyla söyleyin."
+             prompt = f"""{instruction}\nCümle 1: {row["sentence_1"]}\nCümle 2: {row["sentence_2"]}\nSadece tek bir tam sayı söyleyin, ek bir kelime ya da sembol kullanmayın."""
+             message = prompt
+
+             # Get/format answer of the model
+             model_answer = self.generate_response_sts_multi_token(message, max_new_tokens=2)
+             responses.append(model_answer)
+             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+
+             # Print answers
+             print(f"Correct Answer: {answer}")
+             print(f"Model Answer: {model_answer}")
+             print(f"Model Answer Cleaned: {model_answer_cleaned}")
+             print(f"Result: {answer == model_answer_cleaned}")
+
+             # Check if correct based on metric
+             if answer == model_answer_cleaned:
+                 true += 1
+                 difficulty_results['correct'] += 1
+
+             difficulty_results['total'] += 1
+
+         # Print results
+         stats = difficulty_results
+         calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+         print(f"Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
+
src/deepeval/topic_detection.py ADDED
@@ -0,0 +1,79 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import ast
+
+
+ class TopicDetectionTask(BaseTask):
+     def __init__(self, model_name):
+         super().__init__("metunlp/topic_detection_tr", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         dataset = super().load_dataset_from_hf()
+         return dataset.select(range(min(10, len(dataset))))
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+         total_count = 0
+         true = 0
+
+         for row in self.dataset:
+             total_count += 1
+
+             # Get values from row
+             choices = ast.literal_eval(row["choices"])  # Convert string to list
+             formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
+             category = row["level"].lower().replace(' ', '')
+             answer = row["answer"]
+             text = row["text"]
+
+             # Prints for debugging
+             print(f"Choices: {choices}")
+             print("Type of choices:", type(choices))
+             print("Type of answer:", type(answer))
+
+             # Get answer index (starting from 0)
+             if type(answer) == int:
+                 answer_index = answer
+             else:
+                 answer_index = int(answer)
+             correct_answer_letter = chr(65 + answer_index)
+
+             # Construct the prompt/message
+             instruction = "Aşağıdaki metni analiz et ve seçeneklerden bu metnin en olası kategorisini belirle. Temaya ve detaylara dikkat ederek metnin ana fikrini göz önünde bulundurarak soruyu cevapla."
+             prompt = f"{instruction}\n\nMetin:\n{text}\nSeçenekler:\n{formatted_choices}\n\n"
+             message = prompt
+
+             # Get/format answer of the model
+             model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+             responses.append(model_answer)
+             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+
+             # Print answers
+             print(f"Correct Answer: {correct_answer_letter}")
+             print(f"Model Answer: {model_answer}")
+             print(f"Model Answer Cleaned: {model_answer_cleaned}")
+             print(f"Result: {correct_answer_letter == model_answer_cleaned}")
+
+             # Check if correct based on metric
+             if correct_answer_letter == model_answer_cleaned:
+                 true += 1
+                 difficulty_results[category]['correct'] += 1
+
+             difficulty_results[category]['total'] += 1
+
+         # Print results categorized by difficulty
+         for category, stats in difficulty_results.items():
+             calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+             print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
+
src/deepeval/turkish_vocabulary.py ADDED
@@ -0,0 +1,100 @@
+ from src.deepeval.base_task import BaseTask
+ from collections import defaultdict
+ from src.deepeval.utils import accuracy, accuracy_standard_error
+ from typing import Any
+ import os
+ import ast
+ import re
+ from datasets import load_dataset, get_dataset_split_names
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ class TurkishVocabularyTask(BaseTask):
+     def __init__(self, model_name):
+         self.subsets = ["rare", "loan"]
+         super().__init__("metunlp/turkish_vocabulary", model_name=model_name)
+
+     def load_dataset_from_hf(self):
+         evaluate_count = 1
+         print("Loading dataset from Hugging Face.")
+         dataset_dict = {}
+         for subset in self.subsets:
+             subset_data = load_dataset(self.dataset_repo, subset, token=HF_TOKEN, split="train")
+             dataset_dict[subset] = subset_data.select(range(min(evaluate_count, len(subset_data))))
+         print("Dataset loaded.")
+         return dataset_dict
+
+     def evaluate(self) -> dict[str, Any]:
+         responses = []
+         difficulty_results = defaultdict(lambda: defaultdict(lambda: {'correct': 0, 'total': 0}))
+
+         total_count = 0
+         true = 0
+
+         for subset in self.subsets:
+             curr_dataset = self.dataset[subset]
+             print(curr_dataset[0])
+
+             # Determine the question based on the subset
+             if subset == "rare":
+                 question = "Verilen kelimenin eş anlamlısı aşağıdakilerden hangisidir?"
+             elif subset == "loan":
+                 question = "Verilen kelimenin Türkçe kökenli eş anlamlısı aşağıdakilerden hangisidir?"
+             else:
+                 question = "Verilen kelimenin eş anlamlısı aşağıdakilerden hangisidir?"
+
+             for row in curr_dataset:
+                 total_count += 1
+
+                 # Get values from row
+                 category = "hard" if row["level"] == 1 else "easy" if row["level"] == 0 else None
+                 answer_index = row["answer"]
+                 correct_answer_letter = chr(65 + answer_index)
+                 word = row["word"]
+                 choices = ast.literal_eval(row["choices"])  # Convert string to list
+                 formatted_choices = "\n".join([f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices)])
+
+                 # Prints for debugging
+                 print(f"Difficulty: {category}")
+                 print("Type of difficulty:", type(category))
+                 print(f"Answer: {correct_answer_letter}")
+                 print("Type of answer:", type(answer_index))
+
+                 # Construct the prompt/message
+                 instruction = ""
+                 prompt = f"Soru: {question}\nKelime: {word}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+                 message = prompt
+
+                 # Get/format answer of the model
+                 model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+                 responses.append(model_answer)
+                 model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
+
+                 # Print answers
+                 print(f"Correct Answer: {correct_answer_letter}")
+                 print(f"Model Answer: {model_answer}")
+                 print(f"Model Answer Cleaned: {model_answer_cleaned}")
+                 print(f"Result: {correct_answer_letter == model_answer_cleaned}")
+
+                 # Check if correct based on metric
+                 if correct_answer_letter == model_answer_cleaned:
+                     true += 1
+                     difficulty_results[subset][category]['correct'] += 1
+
+                 difficulty_results[subset][category]['total'] += 1
+
+         # Print results categorized by difficulty
+         for subset in self.subsets:
+             subset_results = difficulty_results[subset]
+             for category, stats in subset_results.items():
+                 calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+                 print(f"{subset.capitalize()} {category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+         print("Results:", responses)
+         print("Overall Accuracy:", true / total_count)
+         acc = accuracy(true, total_count)
+         acc_stderr = accuracy_standard_error(acc, total_count)
+         return {"acc": acc, "acc_stderr": acc_stderr}
+