Commit 33d2454 by aacengiz
1 Parent(s): 5912286

Add changes to files

src/deepeval/commonsense_reasoning_task.py CHANGED
@@ -10,7 +10,7 @@ class CommonsenseReasoningTask(BaseTask):
 
     def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
-       return dataset.select(range(min(2, len(dataset))))
+       return dataset.select(range(min(10, len(dataset))))
 
 
    def evaluate(self) -> dict[str, Any]:
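For reference, the row cap in load_dataset_from_hf is plain Dataset.select over an index range; a minimal standalone sketch of the same pattern, assuming the Hugging Face datasets library and a toy in-memory dataset in place of the real task data:

from datasets import Dataset

# Toy stand-in for the dataset the task normally loads from the Hub.
dataset = Dataset.from_dict({"question": [f"q{i}" for i in range(25)]})

# Keep at most the first 10 rows; min() guards against splits shorter than 10.
subset = dataset.select(range(min(10, len(dataset))))
print(len(subset))  # -> 10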
src/deepeval/complex_reasoning.py CHANGED
@@ -0,0 +1,61 @@
+from src.deepeval.base_task import BaseTask
+from collections import defaultdict
+from src.deepeval.utils import accuracy, accuracy_standard_error
+from typing import Any
+import ast
+
+
+class ComplexReasoningTask(BaseTask):
+    def __init__(self, model_name):
+        super().__init__("metunlp/complex-ales", model_name=model_name)
+
+    def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
+
+
+    def evaluate(self) -> dict[str, Any]:
+        responses = []
+        correct_answers = []
+
+        total_count = 0
+        true = 0
+
+        for row in self.dataset:
+            total_count += 1
+
+            # Get values from row
+            choices = ast.literal_eval(row["choices"])  # Convert string to list
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            correct_answer_letter = row["answer_choice"]
+            correct_answers.append(correct_answer_letter)
+
+            # Prints for debugging
+            print(f"Choices: {choices}")
+            print("Type of choices:", type(choices))
+
+
+            # Construct the prompt/message
+            instruction = ""
+            prompt = f"Soru:\n{row['narrative']}\n{row['question']}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+            message = prompt
+
+            # Get/format answer of the model
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+            responses.append(model_answer)
+            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
+
+            if correct_answer_letter == model_answer_cleaned:
+                true += 1
+            # Print answers
+            print(f"Correct Answer: {correct_answer_letter}")
+            print(f"Model Answer: {model_answer}")
+            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+
+        print("Answers:", correct_answers)
+        print("Results:", responses)
+        print("Overall Accuracy:", true / total_count)
+        acc = accuracy(true, total_count)
+        acc_stderr = accuracy_standard_error(acc, total_count)
+        return {"acc": acc, "acc_stderr": acc_stderr}
+
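The choice handling in evaluate() follows the usual letter-prefix pattern: the stringified list in row["choices"] is parsed with ast.literal_eval and each option is labelled A, B, C, and so on. A small self-contained sketch of just that step, using a made-up choices string rather than a real metunlp/complex-ales row:

import ast

# Hypothetical stand-in for row["choices"], which is stored as a stringified list.
raw_choices = "['birinci seçenek', 'ikinci seçenek', 'üçüncü seçenek']"

choices = ast.literal_eval(raw_choices)  # -> list of strings
formatted_choices = "\n".join(f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices))
print(formatted_choices)
# A: birinci seçenek
# B: ikinci seçenek
# C: üçüncü seçenek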
src/deepeval/deepeval_task_manager.py CHANGED
@@ -10,6 +10,9 @@ from src.deepeval.toxicity_task import ToxicityTask
 from src.deepeval.bias_task import BiasTask
 from src.deepeval.instruction_following_task import InstructionFollowingTask
 from src.deepeval.reading_comprehension_task import ReadingComprehensionTask
+from src.deepeval.reading_comp_mc import ReadingComprehensionMCTask
+from src.deepeval.complex_reasoning import ComplexReasoningTask
+from src.deepeval.nli import NLITask
 from typing import List
 load_dotenv()
 
@@ -31,6 +34,9 @@ class Task(Enum):
     INSTRUCTION_FOLLOWING = "instruction_following_tr"
     READING_COMPREHENSION = "reading_comprehension_tr"
     COMMONSENSE_REASONING = "commonsense_reasoning"
+    READING_COMPREHENSION_MC = "reading_comprehension_mc"
+    COMPLEX_REASONING = "complex_reasoning"
+    NLI = "nli"
 
 
 class DeepEvalTaskManager:
@@ -71,11 +77,6 @@ class DeepEvalTaskManager:
         res = turkish_general_knowledge_task.evaluate()
         return res
 
-    def commonsense_reasoning(self):
-        commonsense_reasoning_task = CommonsenseReasoningTask(self.model_name)
-        res = commonsense_reasoning_task.evaluate()
-        return res
-
     def summarization_tr(self):
         task = SummarizationTask(self.model_name)
         return task.evaluate()
@@ -100,7 +101,27 @@ class DeepEvalTaskManager:
         task = ReadingComprehensionTask(self.model_name)
         return task.evaluate()
 
+    def commonsense_reasoning(self):
+        commonsense_reasoning_task = CommonsenseReasoningTask(self.model_name)
+        res = commonsense_reasoning_task.evaluate()
+        return res
+
+    def reading_comprehension_mc(self):
+        reading_comprehension_mc_task = ReadingComprehensionMCTask(self.model_name)
+        res = reading_comprehension_mc_task.evaluate()
+        return res
+
+    def complex_reasoning(self):
+        complex_reasoning_task = ComplexReasoningTask(self.model_name)
+        res = complex_reasoning_task.evaluate()
+        return res
+
+    def nli(self):
+        nli_task = NLITask(self.model_name)
+        res = nli_task.evaluate()
+        return res
+
 if __name__ == "__main__":
-    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["SENTIMENT_ANALYSIS", "SUMMARIZATION", "FAITHFULNESS", "TOXICITY", "BIAS", "INSTRUCTION_FOLLOWING","READING_COMPREHENSION"])
+    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["COMPLEX_REASONING","NLI"])
     res = des.run_tasks()
     print(res)
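Assuming run_tasks() dispatches each enum name to the manager method of the same name (as the __main__ block suggests), the new tasks can be invoked through the manager like this; the model id and task list below are examples only:

from src.deepeval.deepeval_task_manager import DeepEvalTaskManager

manager = DeepEvalTaskManager(
    "meta-llama/Llama-3.2-1B-Instruct",  # any model id the tasks accept
    ["COMPLEX_REASONING", "READING_COMPREHENSION_MC", "NLI"],
)
results = manager.run_tasks()  # presumably collects per-task {"acc": ..., "acc_stderr": ...} results
print(results)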
src/deepeval/nli.py CHANGED
@@ -0,0 +1,74 @@
+from src.deepeval.base_task import BaseTask
+from collections import defaultdict
+from src.deepeval.utils import accuracy, accuracy_standard_error
+from typing import Any
+
+
+class NLITask(BaseTask):
+    def __init__(self, model_name):
+        super().__init__("metunlp/nli_tr", model_name=model_name)
+
+    def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
+
+
+    def evaluate(self) -> dict[str, Any]:
+        responses = []
+        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+        total_count = 0
+        true = 0
+
+        for row in self.dataset:
+            total_count += 1
+
+            # Get values from row
+            label = row["label"].lower().replace(' ', '')
+            choices = ["entailment", "contradiction", "neutral"]
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            category = row["difficulty"]
+            correct_answer_letter = "A" if label == "entailment" else \
+                                    "B" if label == "contradiction" else \
+                                    "C" if label == "neutral" else None
+
+
+            # Prints for debugging
+            print(f"Choices: {choices}")
+            print("Type of choices:", type(choices))
+            print("Label:", label)
+
+            # Construct the prompt/message
+            instruction = ""
+            question = "Yukarıdaki cümleler arasındaki ilişki “entailment” (bir cümle diğerini ima eder), “neutral” (cümleler birbirini ima etmez ve çelişmez) veya “contradiction” (cümleler birbirleriyle çelişir) olarak karakterize edilebilir. Bu ilişkilerden hangisi olduğunu söyleyin."
+            context = f"Bağlam:\n{row['text']}\n"  # can add to prompt if needed
+            prompt = f"Cümle1:\n{row['premise']}\nCümle2:{row['hypothesis']}\nSoru:\n{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+            message = prompt
+
+            # Get/format answer of the model
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=10)
+            responses.append(model_answer)
+            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()
+
+            # Print answers
+            print(f"Correct Answer: {correct_answer_letter}")
+            print(f"Model Answer: {model_answer}")
+            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+
+            # Check if correct based on metric
+            if correct_answer_letter == model_answer_cleaned:
+                true += 1
+                difficulty_results[category]['correct'] += 1
+
+            difficulty_results[category]['total'] += 1
+
+        # Print results categorized by difficulty
+        for category, stats in difficulty_results.items():
+            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+        print("Results:", responses)
+        print("Overall Accuracy:", true / total_count)
+        acc = accuracy(true, total_count)
+        acc_stderr = accuracy_standard_error(acc, total_count)
+        return {"acc": acc, "acc_stderr": acc_stderr}
+
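The chained conditional that maps the gold label to a letter can equally be written as a dictionary lookup; a small standalone sketch of that mapping with the same behaviour as the original, including None for unexpected labels (the helper name is illustrative only):

from typing import Optional

# Letters follow the fixed choices order: A=entailment, B=contradiction, C=neutral.
LABEL_TO_LETTER = {"entailment": "A", "contradiction": "B", "neutral": "C"}

def label_to_letter(raw_label: str) -> Optional[str]:
    """Normalize the label the way evaluate() does, then map it to its letter."""
    label = raw_label.lower().replace(' ', '')
    return LABEL_TO_LETTER.get(label)  # None for anything unexpected, as in the original

print(label_to_letter("Entailment"))  # -> A
print(label_to_letter("neutral"))     # -> C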
src/deepeval/reading_comp_mc.py CHANGED
@@ -0,0 +1,77 @@
+from src.deepeval.base_task import BaseTask
+from collections import defaultdict
+from src.deepeval.utils import accuracy, accuracy_standard_error
+from typing import Any
+import ast
+
+
+class ReadingComprehensionMCTask(BaseTask):
+    def __init__(self, model_name):
+        super().__init__("metunlp/reading_comp_mc", model_name=model_name)
+
+    def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
+
+
+    def evaluate(self) -> dict[str, Any]:
+        responses = []
+        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+        total_count = 0
+        true = 0
+
+        for row in self.dataset:
+            total_count += 1
+
+            # Get values from row
+            choices = ast.literal_eval(row["choices"])  # Convert string to list
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            category = row["difficulty"].lower().replace(' ', '')
+            answer = row["answer"]
+
+            # Prints for debugging
+            print(f"Choices: {choices}")
+            print("Type of choices:", type(choices))
+            print("Type of answer:", type(answer))
+
+            # Get answer index (starting from 0)
+            if type(answer) == int:
+                answer_index = answer
+            else:
+                answer_index = int(answer)
+            correct_answer_letter = chr(65 + answer_index)
+
+
+            # Construct the prompt/message
+            instruction = ""
+            prompt = f"Paragraf:\n{row['text']}\nSoru:{row['question_about_the_text']}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+            message = prompt
+
+            # Get/format answer of the model
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+            responses.append(model_answer)
+            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
+
+            # Print answers
+            print(f"Correct Answer: {correct_answer_letter}")
+            print(f"Model Answer: {model_answer}")
+            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+
+            # Check if correct based on metric
+            if correct_answer_letter == model_answer_cleaned:
+                true += 1
+                difficulty_results[category]['correct'] += 1
+
+            difficulty_results[category]['total'] += 1
+
+        # Print results categorized by difficulty
+        for category, stats in difficulty_results.items():
+            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+        print("Results:", responses)
+        print("Overall Accuracy:", true / total_count)
+        acc = accuracy(true, total_count)
+        acc_stderr = accuracy_standard_error(acc, total_count)
+        return {"acc": acc, "acc_stderr": acc_stderr}
+
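The index-to-letter conversion at the top of the loop only needs int() and chr(); since int() is a no-op on values that are already integers, one call covers both the int and numeric-string cases the if/else above handles. A tiny standalone sketch (the helper name is illustrative only):

def answer_letter(answer) -> str:
    """Map a 0-based answer index (int or numeric string) to its choice letter."""
    answer_index = int(answer)     # handles both 3 and "3"
    return chr(65 + answer_index)  # 0 -> 'A', 1 -> 'B', ...

print(answer_letter(2))    # -> C
print(answer_letter("0"))  # -> A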