src/deepeval/commonsense_reasoning_task.py
CHANGED
@@ -10,7 +10,7 @@ class CommonsenseReasoningTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(
+        return dataset.select(range(min(10, len(dataset))))
 
 
     def evaluate(self) -> dict[str, Any]:
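The only functional change above is the 10-row cap on the loaded split. For reference, a minimal sketch of what that select call does, assuming load_dataset_from_hf() returns a Hugging Face datasets.Dataset (which the select()/len() usage implies); the "idx" column is a made-up stand-in:

from datasets import Dataset

# Toy split standing in for the real one; "idx" is a hypothetical column.
ds = Dataset.from_dict({"idx": list(range(25))})
capped = ds.select(range(min(10, len(ds))))  # keeps the first 10 rows, or fewer on a smaller split
print(len(capped))  # -> 10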
src/deepeval/complex_reasoning.py
CHANGED
@@ -0,0 +1,61 @@
+from src.deepeval.base_task import BaseTask
+from collections import defaultdict
+from src.deepeval.utils import accuracy, accuracy_standard_error
+from typing import Any
+import ast
+
+
+class ComplexReasoningTask(BaseTask):
+    def __init__(self, model_name):
+        super().__init__("metunlp/complex-ales", model_name=model_name)
+
+    def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
+
+
+    def evaluate(self) -> dict[str, Any]:
+        responses = []
+        correct_answers = []
+
+        total_count = 0
+        true = 0
+
+        for row in self.dataset:
+            total_count += 1
+
+            # Get values from row
+            choices = ast.literal_eval(row["choices"]) # Convert string to list
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            correct_answer_letter = row["answer_choice"]
+            correct_answers.append(correct_answer_letter)
+
+            # Prints for debugging
+            print(f"Choices: {choices}")
+            print("Type of choices:", type(choices))
+
+
+            # Construct the prompt/message
+            instruction = ""
+            prompt = f"Soru:\n{row['narrative']}\n{row['question']}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+            message = prompt
+
+            # Get/format answer of the model
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+            responses.append(model_answer)
+            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
+
+            if correct_answer_letter == model_answer_cleaned:
+                true += 1
+            # Print answers
+            print(f"Correct Answer: {correct_answer_letter}")
+            print(f"Model Answer: {model_answer}")
+            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+
+        print("Answers:", correct_answers)
+        print("Results:", responses)
+        print("Overall Accuracy:", true / total_count)
+        acc = accuracy(true, total_count)
+        acc_stderr = accuracy_standard_error(acc, total_count)
+        return {"acc": acc, "acc_stderr": acc_stderr}
+
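Two small string conventions from evaluate() above, shown standalone with made-up values: options are lettered with chr(65+i), and the model's reply is normalised before it is compared with the gold letter. A minimal sketch, not part of the diff:

import ast

raw_choices = '["akıl", "mantık", "sezgi"]'  # hypothetical stand-in for row["choices"]
choices = ast.literal_eval(raw_choices)  # string -> list
formatted_choices = "\n".join(f"{chr(65 + i)}: {c}" for i, c in enumerate(choices))
print(formatted_choices)  # A: akıl / B: mantık / C: sezgi, one per line

model_answer = " b:\n"  # hypothetical raw model output
cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
print(cleaned)  # -> "B"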
src/deepeval/deepeval_task_manager.py
CHANGED
@@ -10,6 +10,9 @@ from src.deepeval.toxicity_task import ToxicityTask
 from src.deepeval.bias_task import BiasTask
 from src.deepeval.instruction_following_task import InstructionFollowingTask
 from src.deepeval.reading_comprehension_task import ReadingComprehensionTask
+from src.deepeval.reading_comp_mc import ReadingComprehensionMCTask
+from src.deepeval.complex_reasoning import ComplexReasoningTask
+from src.deepeval.nli import NLITask
 from typing import List
 load_dotenv()
 
@@ -31,6 +34,9 @@ class Task(Enum):
     INSTRUCTION_FOLLOWING = "instruction_following_tr"
     READING_COMPREHENSION = "reading_comprehension_tr"
     COMMONSENSE_REASONING = "commonsense_reasoning"
+    READING_COMPREHENSION_MC = "reading_comprehension_mc"
+    COMPLEX_REASONING = "complex_reasoning"
+    NLI = "nli"
 
 
 class DeepEvalTaskManager:
@@ -71,11 +77,6 @@ class DeepEvalTaskManager:
         res = turkish_general_knowledge_task.evaluate()
         return res
 
-    def commonsense_reasoning(self):
-        commonsense_reasoning_task = CommonsenseReasoningTask(self.model_name)
-        res = commonsense_reasoning_task.evaluate()
-        return res
-
     def summarization_tr(self):
         task = SummarizationTask(self.model_name)
         return task.evaluate()
@@ -100,7 +101,27 @@ class DeepEvalTaskManager:
         task = ReadingComprehensionTask(self.model_name)
         return task.evaluate()
 
+    def commonsense_reasoning(self):
+        commonsense_reasoning_task = CommonsenseReasoningTask(self.model_name)
+        res = commonsense_reasoning_task.evaluate()
+        return res
+
+    def reading_comprehension_mc(self):
+        reading_comprehension_mc_task = ReadingComprehensionMCTask(self.model_name)
+        res = reading_comprehension_mc_task.evaluate()
+        return res
+
+    def complex_reasoning(self):
+        complex_reasoning_task = ComplexReasoningTask(self.model_name)
+        res = complex_reasoning_task.evaluate()
+        return res
+
+    def nli(self):
+        nli_task = NLITask(self.model_name)
+        res = nli_task.evaluate()
+        return res
+
 if __name__ == "__main__":
-    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["
+    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["COMPLEX_REASONING","NLI"])
     res = des.run_tasks()
     print(res)
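The __main__ block now selects tasks by enum member name ("COMPLEX_REASONING", "NLI"), while the new methods are named after the enum values ("complex_reasoning", "nli"). run_tasks() is not shown in this diff; a dispatch along the following lines would be consistent with that naming, but DemoManager below is a hypothetical sketch, not the repository's implementation:

from enum import Enum

class Task(Enum):
    COMPLEX_REASONING = "complex_reasoning"
    NLI = "nli"

class DemoManager:
    # Stub task methods named after the enum values, mirroring the manager's layout.
    def complex_reasoning(self):
        return {"acc": 0.0, "acc_stderr": 0.0}

    def nli(self):
        return {"acc": 0.0, "acc_stderr": 0.0}

    def run_tasks(self, task_names):
        results = {}
        for name in task_names:
            task = Task[name]                   # "NLI" -> Task.NLI
            method = getattr(self, task.value)  # "nli" -> self.nli
            results[task.value] = method()
        return results

print(DemoManager().run_tasks(["COMPLEX_REASONING", "NLI"]))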
src/deepeval/nli.py
CHANGED
@@ -0,0 +1,74 @@
+from src.deepeval.base_task import BaseTask
+from collections import defaultdict
+from src.deepeval.utils import accuracy, accuracy_standard_error
+from typing import Any
+
+
+class NLITask(BaseTask):
+    def __init__(self, model_name):
+        super().__init__("metunlp/nli_tr", model_name=model_name)
+
+    def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
+
+
+    def evaluate(self) -> dict[str, Any]:
+        responses = []
+        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+        total_count = 0
+        true = 0
+
+        for row in self.dataset:
+            total_count += 1
+
+            # Get values from row
+            label = row["label"].lower().replace(' ','')
+            choices = ["entailment", "contradiction", "neutral"]
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            category = row["difficulty"]
+            correct_answer_letter = "A" if label == "entailment" else \
+                                    "B" if label == "contradiction" else \
+                                    "C" if label == "neutral" else None
+
+
+            # Prints for debugging
+            print(f"Choices: {choices}")
+            print("Type of choices:", type(choices))
+            print("Label:", label)
+
+            # Construct the prompt/message
+            instruction = ""
+            question = "Yukarıdaki cümleler arasındaki ilişki “entailment” (bir cümle diğerini ima eder), “neutral” (cümleler birbirini ima etmez ve çelişmez) veya “contradiction” (cümleler birbirleriyle çelişir) olarak karakterize edilebilir. Bu ilişkilerden hangisi olduğunu söyleyin."
+            context = f"Bağlam:\n{row['text']}\n"  # can add to prompt if needed
+            prompt = f"Cümle1:\n{row['premise']}\nCümle2:{row['hypothesis']}\nSoru:\n{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+            message = prompt
+
+            # Get/format answer of the model
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=10)
+            responses.append(model_answer)
+            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()
+
+            # Print answers
+            print(f"Correct Answer: {correct_answer_letter}")
+            print(f"Model Answer: {model_answer}")
+            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+
+            # Check if correct based on metric
+            if correct_answer_letter == model_answer_cleaned:
+                true += 1
+                difficulty_results[category]['correct'] += 1
+
+            difficulty_results[category]['total'] += 1
+
+        # Print results categorized by difficulty
+        for category, stats in difficulty_results.items():
+            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+        print("Results:", responses)
+        print("Overall Accuracy:", true / total_count)
+        acc = accuracy(true, total_count)
+        acc_stderr = accuracy_standard_error(acc, total_count)
+        return {"acc": acc, "acc_stderr": acc_stderr}
+
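The chained conditional that maps the gold label to a letter could also be written as a dictionary lookup. A compact alternative sketch (not what the diff does), with the same fallback to None for unknown labels:

from typing import Optional

LABEL_TO_LETTER = {"entailment": "A", "contradiction": "B", "neutral": "C"}

def label_to_letter(label: str) -> Optional[str]:
    # Normalise the raw label the same way evaluate() does before the lookup.
    return LABEL_TO_LETTER.get(label.lower().replace(' ', ''))

assert label_to_letter("Entailment") == "A"
assert label_to_letter("neutral") == "C"
assert label_to_letter("unknown") is None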
src/deepeval/reading_comp_mc.py
CHANGED
@@ -0,0 +1,77 @@
+from src.deepeval.base_task import BaseTask
+from collections import defaultdict
+from src.deepeval.utils import accuracy, accuracy_standard_error
+from typing import Any
+import ast
+
+
+class ReadingComprehensionMCTask(BaseTask):
+    def __init__(self, model_name):
+        super().__init__("metunlp/reading_comp_mc", model_name=model_name)
+
+    def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
+
+
+    def evaluate(self) -> dict[str, Any]:
+        responses = []
+        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+        total_count = 0
+        true = 0
+
+        for row in self.dataset:
+            total_count += 1
+
+            # Get values from row
+            choices = ast.literal_eval(row["choices"]) # Convert string to list
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            category = row["difficulty"].lower().replace(' ','')
+            answer = row["answer"]
+
+            # Prints for debugging
+            print(f"Choices: {choices}")
+            print("Type of choices:", type(choices))
+            print("Type of answer:", type(answer))
+
+            # Get answer index (starting from 0)
+            if type(answer) == int:
+                answer_index = answer
+            else:
+                answer_index = int(answer)
+            correct_answer_letter = chr(65 + answer_index)
+
+
+            # Construct the prompt/message
+            instruction = ""
+            prompt = f"Paragraf:\n{row['text']}\nSoru:{row['question_about_the_text']}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+            message = prompt
+
+            # Get/format answer of the model
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+            responses.append(model_answer)
+            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
+
+            # Print answers
+            print(f"Correct Answer: {correct_answer_letter}")
+            print(f"Model Answer: {model_answer}")
+            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+
+            # Check if correct based on metric
+            if correct_answer_letter == model_answer_cleaned:
+                true += 1
+                difficulty_results[category]['correct'] += 1
+
+            difficulty_results[category]['total'] += 1
+
+        # Print results categorized by difficulty
+        for category, stats in difficulty_results.items():
+            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+        print("Results:", responses)
+        print("Overall Accuracy:", true / total_count)
+        acc = accuracy(true, total_count)
+        acc_stderr = accuracy_standard_error(acc, total_count)
+        return {"acc": acc, "acc_stderr": acc_stderr}
+
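The NLI and reading-comprehension MC tasks share the same per-difficulty bookkeeping: a defaultdict of correct/total counters keyed by the row's difficulty, reported next to the overall accuracy. A standalone sketch of that pattern with hypothetical results:

from collections import defaultdict

difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
graded = [("easy", True), ("easy", False), ("hard", True)]  # hypothetical (difficulty, was_correct) pairs

for category, was_correct in graded:
    if was_correct:
        difficulty_results[category]['correct'] += 1
    difficulty_results[category]['total'] += 1

for category, stats in difficulty_results.items():
    acc = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
    print(f"{category.capitalize()} Accuracy: {acc:.2%} ({stats['correct']}/{stats['total']})")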