Ahmet Kaan Sever committed on
Commit 1a88171 · 2 Parent(s): 6807ea3 ddb6316

Merge branch 'main' into deneme

src/deepeval/base_task.py CHANGED
@@ -178,7 +178,7 @@ class BaseTask(ABC):
         ]
         allowed_special_tokens = self.tokenizer.apply_chat_template(allowed_token_chat, tokenize=True)
         return allowed_special_tokens
-
+
     @abstractmethod
     def load_dataset_from_hf(self):
         """
src/deepeval/commonsense_reasoning_task.py ADDED
@@ -0,0 +1,86 @@
+from src.deepeval.base_task import BaseTask
+from collections import defaultdict
+from src.deepeval.utils import accuracy, accuracy_standard_error
+from typing import Any
+
+
+class CommonsenseReasoningTask(BaseTask):
+    def __init__(self, model_name):
+        super().__init__("metunlp/commonsense", model_name=model_name)
+
+    def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
+
+
+    def evaluate(self) -> dict[str, Any]:
+        responses = []
+        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+        total_count = 0
+        true = 0
+
+        for row in self.dataset:
+            total_count += 1
+
+            # Get values from row
+            label = row["label"]
+            choices=[row["choice1"], row["choice2"]]
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            category = row["difficulty"]
+            answer = row["answer"]
+            text = row["text"]
+            context = row["context"]
+
+            # Prints for debugging
+            print(f"Choices: {choices}")
+            print("Type of choices:", type(choices))
+            print("Type of answer:", type(answer))
+
+            # Get answer index (starting from 0)
+            if type(answer) == int:
+                answer_index = answer - 1  # 1 or 2
+            else:
+                answer_index = int(answer) - 1
+            correct_answer_letter = chr(65 + answer_index)
+
+            # Get question based on label
+            if label == "effect":
+                question = "Seçeneklerden hangisi verilen önermenin bir sonucu veya etkisi olabilir?"
+            elif label == "cause":
+                question = "Seçeneklerden hangisi verilen önermenin bir neden veya sebebi olabilir?"
+            else:
+                question = "Seçeneklerden hangisi uygun?"  # Alternatif
+
+            # Construct the prompt/message
+            instruction = ""
+            prompt = f"Bağlam:\n{text}\nÖnerme:\n{context}\nSoru:{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+            message = prompt
+
+            # Get/format answer of the model
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=10)
+            responses.append(model_answer)
+            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()
+
+            # Print answers
+            print(f"Correct Answer: {correct_answer_letter}")
+            print(f"Model Answer: {model_answer}")
+            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+
+            # Check if correct based on metric
+            if correct_answer_letter == model_answer_cleaned:
+                true += 1
+                difficulty_results[category]['correct'] += 1
+
+            difficulty_results[category]['total'] += 1
+
+        # Print results categorized by difficulty
+        for category, stats in difficulty_results.items():
+            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+        print("Results:", responses)
+        print("Overall Accuracy:", true / total_count)
+        acc = accuracy(true, total_count)
+        acc_stderr = accuracy_standard_error(acc, total_count)
+        return {"acc": acc, "acc_stderr": acc_stderr}
+
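
Note: `evaluate()` above delegates its final numbers to `accuracy` and `accuracy_standard_error` from `src.deepeval.utils`, which are not part of this diff. A minimal sketch of what such helpers conventionally compute — the accuracy ratio and the binomial standard error of a proportion; the real implementations in the repo may differ:

    import math

    # Assumed behaviour of the helpers imported from src.deepeval.utils;
    # the actual implementations are not shown in this commit.
    def accuracy(correct: int, total: int) -> float:
        return correct / total if total > 0 else 0.0

    def accuracy_standard_error(acc: float, total: int) -> float:
        # Standard error of a binomial proportion: sqrt(p * (1 - p) / n).
        return math.sqrt(acc * (1.0 - acc) / total) if total > 0 else 0.0

    print(accuracy(7, 10))                   # 0.7
    print(accuracy_standard_error(0.7, 10))  # ~0.145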
src/deepeval/complex_reasoning.py ADDED
@@ -0,0 +1,63 @@
+from src.deepeval.base_task import BaseTask
+from collections import defaultdict
+from src.deepeval.utils import accuracy, accuracy_standard_error
+from typing import Any
+import ast
+
+
+class ComplexReasoningTask(BaseTask):
+    def __init__(self, model_name):
+        super().__init__("metunlp/complex-ales", model_name=model_name)
+
+    def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
+
+
+    def evaluate(self) -> dict[str, Any]:
+        responses = []
+        correct_answers = []
+
+        total_count = 0
+        true = 0
+
+        for row in self.dataset:
+            total_count += 1
+
+            # Get values from row
+            choices = ast.literal_eval(row["choices"])  # Convert string to list
+            narrative = row["narrative"]
+            question = row["question"]
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            correct_answer_letter = row["answer_choice"]
+            correct_answers.append(correct_answer_letter)
+
+            # Prints for debugging
+            print(f"Choices: {choices}")
+            print("Type of choices:", type(choices))
+
+
+            # Construct the prompt/message
+            instruction = ""
+            prompt = f"Soru:\n{narrative}\n{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+            message = prompt
+
+            # Get/format answer of the model
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+            responses.append(model_answer)
+            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
+
+            if correct_answer_letter == model_answer_cleaned:
+                true += 1
+            # Print answers
+            print(f"Correct Answer: {correct_answer_letter}")
+            print(f"Model Answer: {model_answer}")
+            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+
+        print("Answers:", correct_answers)
+        print("Results:", responses)
+        print("Overall Accuracy:", true / total_count)
+        acc = accuracy(true, total_count)
+        acc_stderr = accuracy_standard_error(acc, total_count)
+        return {"acc": acc, "acc_stderr": acc_stderr}
+
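
Note: unlike the commonsense task, the `choices` column here arrives as a string, so `evaluate()` parses it with `ast.literal_eval` before formatting. A standalone illustration of that parsing step (the sample value is made up, not from the dataset):

    import ast

    # The dataset stores the options as the string form of a Python list;
    # ast.literal_eval safely turns it back into a real list.
    raw_choices = '["birinci seçenek", "ikinci seçenek", "üçüncü seçenek"]'
    choices = ast.literal_eval(raw_choices)

    formatted_choices = "\n".join(f"{chr(65 + i)}: {choice}" for i, choice in enumerate(choices))
    print(formatted_choices)
    # A: birinci seçenek
    # B: ikinci seçenek
    # C: üçüncü seçenek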
src/deepeval/deepeval_task_manager.py CHANGED
@@ -1,112 +1,140 @@
-import os
-from src.deepeval.turkish_general_knowledge_task import TurkishGeneralKnowledgeTask
-from dotenv import load_dotenv
-from enum import Enum
-from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
-from src.deepeval.summarization_task import SummarizationTask
-from src.deepeval.faithfulness_task import FaithfulnessTask
-from src.deepeval.toxicity_task import ToxicityTask
-from src.deepeval.bias_task import BiasTask
-from src.deepeval.instruction_following_task import InstructionFollowingTask
-from src.deepeval.reading_comprehension_task import ReadingComprehensionTask
-from src.deepeval.truthfulness_task import TruthfulnessTask
-from typing import List
-load_dotenv()
-
-openai_configs = {
-    'OPENAI_API_KEY': 'OPENAI_KEY'
-}
-os.environ['OPENAI_API_KEY'] = openai_configs['OPENAI_API_KEY']
-
-HF_TOKEN=os.getenv("HF_TOKEN")
-
-class Task(Enum):
-    # SUMMARIZATION = "summarization"
-    SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
-    TURKISH_GENERAL_KNOWLEDGE = "turkish_general_knowledge"
-    SUMMARIZATION = "summarization_tr"
-    FAITHFULNESS = "sosyoloji_faithfulness"
-    TOXICITY = "sosyoloji_toxicity"
-    BIAS = "sosyoloji_bias"
-    INSTRUCTION_FOLLOWING = "instruction_following_tr"
-    READING_COMPREHENSION = "reading_comp_oe"
-    TRUTHFULNESS = "sosyoloji_truthfulness"
-
-
-class DeepEvalTaskManager:
-    def __init__(self, model_name, tasks: List[str]):
-        self.model_name = model_name
-        self.available_tasks = {task.name: getattr(self, task.value) for task in Task}
-        self.tasks_to_run = self.validate_tasks(tasks)
-
-    def validate_tasks(self, user_tasks):
-        """Validate user tasks and store method references."""
-        print(self.available_tasks.keys())
-        print(user_tasks)
-        if not set(user_tasks).issubset(self.available_tasks.keys()):
-            invalid_tasks = set(user_tasks) - self.available_tasks.keys()
-            raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")
-
-        # Store actual method references instead of strings
-        return {task : self.available_tasks[task] for task in user_tasks}
-
-    def run_tasks(self):
-        """Execute validated tasks in order."""
-        results = {}
-        for task_name, task_method in self.tasks_to_run.items():
-            print("Running task: ", task_name)
-            task_enum = getattr(Task, task_name)
-            task_value = task_enum.value
-            results[task_value] = task_method()  # Call the stored method reference
-
-        return results
-
-    def sentiment_analysis_tr(self):
-        st_task = SentimentAnalysisTask(self.model_name)
-        res = st_task.evaluate()
-        return res
-
-    def turkish_general_knowledge(self):
-        turkish_general_knowledge_task = TurkishGeneralKnowledgeTask(self.model_name)
-        res = turkish_general_knowledge_task.evaluate()
-        return res
-
-    def summarization_tr(self):
-        summarization_task = SummarizationTask(self.model_name)
-        res = summarization_task.evaluate()
-        return res
-
-    def sosyoloji_faithfulness(self):
-        faithfulness_task = FaithfulnessTask(self.model_name)
-        res = faithfulness_task.evaluate()
-        return res
-
-    def sosyoloji_toxicity(self):
-        toxicity_task = ToxicityTask(self.model_name)
-        res = toxicity_task.evaluate()
-        return res
-
-    def sosyoloji_bias(self):
-        bias_task = BiasTask(self.model_name)
-        res = bias_task.evaluate()
-        return res
-
-    def instruction_following_tr(self):
-        instruction_following_task = InstructionFollowingTask(self.model_name)
-        res = instruction_following_task.evaluate()
-        return res
-
-    def reading_comp_oe(self):
-        reading_comprehension_task = ReadingComprehensionTask(self.model_name)
-        res = reading_comprehension_task.evaluate()
-        return res
-
-    def sosyoloji_truthfulness(self):
-        truthfulness_task = TruthfulnessTask(self.model_name)
-        res = truthfulness_task.evaluate()
-        return res
-
-if __name__ == "__main__":
-    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["SENTIMENT_ANALYSIS", "SUMMARIZATION", "FAITHFULNESS", "TOXICITY", "BIAS", "INSTRUCTION_FOLLOWING","READING_COMPREHENSION", "TRUTHFULNESS"])
-    res = des.run_tasks()
+import os
+from src.deepeval.turkish_general_knowledge_task import TurkishGeneralKnowledgeTask
+from dotenv import load_dotenv
+from enum import Enum
+from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
+from src.deepeval.commonsense_reasoning_task import CommonsenseReasoningTask
+from src.deepeval.summarization_task import SummarizationTask
+from src.deepeval.faithfulness_task import FaithfulnessTask
+from src.deepeval.toxicity_task import ToxicityTask
+from src.deepeval.bias_task import BiasTask
+from src.deepeval.instruction_following_task import InstructionFollowingTask
+from src.deepeval.reading_comprehension_task import ReadingComprehensionTask
+from src.deepeval.reading_comp_mc import ReadingComprehensionMCTask
+from src.deepeval.complex_reasoning import ComplexReasoningTask
+from src.deepeval.truthfulness_task import TruthfulnessTask
+from src.deepeval.nli import NLITask
+from typing import List
+load_dotenv()
+
+openai_configs = {
+    'OPENAI_API_KEY': 'OPENAI_KEY'
+}
+os.environ['OPENAI_API_KEY'] = openai_configs['OPENAI_API_KEY']
+
+HF_TOKEN=os.getenv("HF_TOKEN")
+
+class Task(Enum):
+    # SUMMARIZATION = "summarization"
+    SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
+    TURKISH_GENERAL_KNOWLEDGE = "turkish_general_knowledge"
+    SUMMARIZATION = "summarization_tr"
+    FAITHFULNESS = "sosyoloji_faithfulness"
+    TOXICITY = "sosyoloji_toxicity"
+    BIAS = "sosyoloji_bias"
+    INSTRUCTION_FOLLOWING = "instruction_following_tr"
+    READING_COMPREHENSION = "reading_comprehension_mc"
+    READING_COMPREHENSION_OE = "reading_comp_oe"
+    COMMONSENSE_REASONING = "commonsense_reasoning"
+    COMPLEX_REASONING = "complex_reasoning"
+    TRUTHFULNESS = "sosyoloji_truthfulness"
+    NLI = "nli"
+
+
+class DeepEvalTaskManager:
+    def __init__(self, model_name, tasks: List[str]):
+        self.model_name = model_name
+        self.available_tasks = {task.name: getattr(self, task.value) for task in Task}
+        self.tasks_to_run = self.validate_tasks(tasks)
+
+    def validate_tasks(self, user_tasks):
+        """Validate user tasks and store method references."""
+        print(self.available_tasks.keys())
+        print(user_tasks)
+        if not set(user_tasks).issubset(self.available_tasks.keys()):
+            invalid_tasks = set(user_tasks) - self.available_tasks.keys()
+            raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")
+
+        # Store actual method references instead of strings
+        return {task : self.available_tasks[task] for task in user_tasks}
+
+    def run_tasks(self):
+        """Execute validated tasks in order."""
+        results = {}
+        for task_name, task_method in self.tasks_to_run.items():
+            print("Running task: ", task_name)
+            task_enum = getattr(Task, task_name)
+            task_value = task_enum.value
+            results[task_value] = task_method()  # Call the stored method reference
+
+        return results
+
+    def sentiment_analysis_tr(self):
+        st_task = SentimentAnalysisTask(self.model_name)
+        res = st_task.evaluate()
+        return res
+
+    def turkish_general_knowledge(self):
+        turkish_general_knowledge_task = TurkishGeneralKnowledgeTask(self.model_name)
+        res = turkish_general_knowledge_task.evaluate()
+        return res
+
+    def summarization_tr(self):
+        summarization_task = SummarizationTask(self.model_name)
+        res = summarization_task.evaluate()
+        return res
+
+    def sosyoloji_faithfulness(self):
+        faithfulness_task = FaithfulnessTask(self.model_name)
+        res = faithfulness_task.evaluate()
+        return res
+
+    def sosyoloji_toxicity(self):
+        toxicity_task = ToxicityTask(self.model_name)
+        res = toxicity_task.evaluate()
+        return res
+
+    def sosyoloji_bias(self):
+        bias_task = BiasTask(self.model_name)
+        res = bias_task.evaluate()
+        return res
+
+    def instruction_following_tr(self):
+        instruction_following_task = InstructionFollowingTask(self.model_name)
+        res = instruction_following_task.evaluate()
+        return res
+
+    def reading_comprehension_mc(self):
+        reading_comprehension_mc_task = ReadingComprehensionMCTask(self.model_name)
+        res = reading_comprehension_mc_task.evaluate()
+        return res
+
+    def reading_comp_oe(self):
+        reading_comprehension_task = ReadingComprehensionTask(self.model_name)
+        res = reading_comprehension_task.evaluate()
+        return res
+
+    def commonsense_reasoning(self):
+        commonsense_reasoning_task = CommonsenseReasoningTask(self.model_name)
+        res = commonsense_reasoning_task.evaluate()
+        return res
+
+    def complex_reasoning(self):
+        complex_reasoning_task = ComplexReasoningTask(self.model_name)
+        res = complex_reasoning_task.evaluate()
+        return res
+
+    def sosyoloji_truthfulness(self):
+        truthfulness_task = TruthfulnessTask(self.model_name)
+        res = truthfulness_task.evaluate()
+        return res
+
+    def nli(self):
+        nli_task = NLITask(self.model_name)
+        res = nli_task.evaluate()
+        return res
+
+if __name__ == "__main__":
+    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["COMPLEX_REASONING","NLI"])
+    res = des.run_tasks()
     print(res)
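
Note: with the new enum members wired to same-named methods, the manager resolves each requested task *name* to the bound method whose name equals the enum *value* via `getattr`. A short usage sketch mirroring the `__main__` block above (running it for real downloads the model and datasets):

    # Sketch of how the manager is driven; mirrors the __main__ block above.
    from src.deepeval.deepeval_task_manager import DeepEvalTaskManager

    manager = DeepEvalTaskManager(
        "meta-llama/Llama-3.2-1B-Instruct",
        ["COMMONSENSE_REASONING", "COMPLEX_REASONING", "NLI"],
    )
    # validate_tasks() has already mapped each enum name to its method,
    # so run_tasks() simply calls them in order and keys results by enum value.
    results = manager.run_tasks()
    print(results)  # e.g. {"commonsense_reasoning": {"acc": ..., "acc_stderr": ...}, ...}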
src/deepeval/nli.py ADDED
@@ -0,0 +1,77 @@
+from src.deepeval.base_task import BaseTask
+from collections import defaultdict
+from src.deepeval.utils import accuracy, accuracy_standard_error
+from typing import Any
+
+
+class NLITask(BaseTask):
+    def __init__(self, model_name):
+        super().__init__("metunlp/nli_tr", model_name=model_name)
+
+    def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
+
+
+    def evaluate(self) -> dict[str, Any]:
+        responses = []
+        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+        total_count = 0
+        true = 0
+
+        for row in self.dataset:
+            total_count += 1
+
+            # Get values from row
+            text = row["text"]
+            premise = row["premise"]
+            hypothesis = row["hypothesis"]
+            label = row["label"].lower().replace(' ','')
+            choices=["entailment","contradiction","neutral"]
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            category = row["difficulty"]
+            correct_answer_letter = "A" if label == "entailment" else \
+                                    "B" if label == "contradiction" else \
+                                    "C" if label == "neutral" else None
+
+
+            # Prints for debugging
+            print(f"Choices: {choices}")
+            print("Type of choices:", type(choices))
+            print("Label:", label)
+
+            # Construct the prompt/message
+            instruction = ""
+            question = "Yukarıdaki cümleler arasındaki ilişki “entailment” (bir cümle diğerini ima eder), “neutral” (cümleler birbirini ima etmez ve çelişmez) veya “contradiction” (cümleler birbirleriyle çelişir) olarak karakterize edilebilir. Bu ilişkilerden hangisi olduğunu söyleyin."
+            context = f"Bağlam:\n{text}\n"  # can add to prompt if needed
+            prompt = f"Cümle1:\n{premise}\nCümle2:{hypothesis}\nSoru:\n{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+            message = prompt
+
+            # Get/format answer of the model
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=10)
+            responses.append(model_answer)
+            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()
+
+            # Print answers
+            print(f"Correct Answer: {correct_answer_letter}")
+            print(f"Model Answer: {model_answer}")
+            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+
+            # Check if correct based on metric
+            if correct_answer_letter == model_answer_cleaned:
+                true += 1
+                difficulty_results[category]['correct'] += 1
+
+            difficulty_results[category]['total'] += 1
+
+        # Print results categorized by difficulty
+        for category, stats in difficulty_results.items():
+            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+        print("Results:", responses)
+        print("Overall Accuracy:", true / total_count)
+        acc = accuracy(true, total_count)
+        acc_stderr = accuracy_standard_error(acc, total_count)
+        return {"acc": acc, "acc_stderr": acc_stderr}
+
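
Note: the chained conditional that maps the gold label to a letter above is equivalent to a small dictionary lookup; a standalone sketch of that mapping for clarity (an illustration, not a change to the committed code):

    # Equivalent, standalone version of the label-to-letter mapping used above.
    LABEL_TO_LETTER = {"entailment": "A", "contradiction": "B", "neutral": "C"}

    def label_letter(label: str) -> str | None:
        # Normalise the label the same way evaluate() does before comparing.
        return LABEL_TO_LETTER.get(label.lower().replace(" ", ""))

    print(label_letter("Entailment"))  # A
    print(label_letter("neutral"))     # C
    print(label_letter("unknown"))     # None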
src/deepeval/reading_comp_mc.py ADDED
@@ -0,0 +1,79 @@
+from src.deepeval.base_task import BaseTask
+from collections import defaultdict
+from src.deepeval.utils import accuracy, accuracy_standard_error
+from typing import Any
+import ast
+
+
+class ReadingComprehensionMCTask(BaseTask):
+    def __init__(self, model_name):
+        super().__init__("metunlp/reading_comp_mc", model_name=model_name)
+
+    def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
+
+
+    def evaluate(self) -> dict[str, Any]:
+        responses = []
+        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+        total_count = 0
+        true = 0
+
+        for row in self.dataset:
+            total_count += 1
+
+            # Get values from row
+            choices = ast.literal_eval(row["choices"])  # Convert string to list
+            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
+            category = row["difficulty"].lower().replace(' ','')
+            answer = row["answer"]
+            text = row["text"]
+            question_about_the_text = row["question_about_the_text"]
+
+            # Prints for debugging
+            print(f"Choices: {choices}")
+            print("Type of choices:", type(choices))
+            print("Type of answer:", type(answer))
+
+            # Get answer index (starting from 0)
+            if type(answer) == int:
+                answer_index = answer
+            else:
+                answer_index = int(answer)
+            correct_answer_letter = chr(65 + answer_index)
+
+
+            # Construct the prompt/message
+            instruction = ""
+            prompt = f"Paragraf:\n{text}\nSoru:{question_about_the_text}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
+            message = prompt
+
+            # Get/format answer of the model
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
+            responses.append(model_answer)
+            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')
+
+            # Print answers
+            print(f"Correct Answer: {correct_answer_letter}")
+            print(f"Model Answer: {model_answer}")
+            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+
+            # Check if correct based on metric
+            if correct_answer_letter == model_answer_cleaned:
+                true += 1
+                difficulty_results[category]['correct'] += 1
+
+            difficulty_results[category]['total'] += 1
+
+        # Print results categorized by difficulty
+        for category, stats in difficulty_results.items():
+            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+        print("Results:", responses)
+        print("Overall Accuracy:", true / total_count)
+        acc = accuracy(true, total_count)
+        acc_stderr = accuracy_standard_error(acc, total_count)
+        return {"acc": acc, "acc_stderr": acc_stderr}
+
+
src/deepeval/sentiment_analysis_task.py CHANGED
@@ -7,6 +7,7 @@ class SentimentAnalysisTask(BaseTask):
         super().__init__("metunlp/sentiment_analysis_tr", model_name=model_name)
 
     def load_dataset_from_hf(self):
+        print("Loading the dataset")
         dataset = super().load_dataset_from_hf()
         return dataset.select(range(min(10, len(dataset))))
 
src/deepeval/turkish_general_knowledge_task.py CHANGED
@@ -9,7 +9,7 @@ class TurkishGeneralKnowledgeTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(10, len(dataset))))
+        return dataset.select(range(min(1, len(dataset))))
 
     def evaluate(self):
         responses = []
svc/router.py CHANGED
@@ -43,6 +43,10 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
 async def protected_route(username: str = Depends(get_current_user)):
     return {"message": f"Hello, {username}! This is a protected resource."}
 
+@router.get("/deepeval/status")
+async def deep_eval_status():
+    #Return running with 200 status code
+    return {"status": "running"}
 
 @router.post("/chat", response_model=TaskResponse)
 def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_current_user)):
@@ -78,7 +82,6 @@ def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_c
     return TaskResponse(results=dumped)
 
 
-
 @router.post("/deepeval/eval", response_model=TaskResponse)
 async def deep_eval_suite(request: DeepEvalSuiteRequest):
     des = DeepEvalTaskManager(request.model_name, request.tasks)
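
Note: the new GET /deepeval/status route simply reports that the service is up. A sketch of probing it with requests, assuming the FastAPI app that includes this router is served locally on port 8000 (the host, port, and uvicorn module path are illustrative assumptions, not part of this commit):

    # Assumes the service is already running, e.g. `uvicorn svc.main:app --port 8000`
    # (module path and port are guesses for illustration only).
    import requests

    resp = requests.get("http://localhost:8000/deepeval/status", timeout=5)
    print(resp.status_code)  # 200 when the service is up
    print(resp.json())       # {"status": "running"}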