Merge branch 'main' into deneme
Changed files:
- src/deepeval/base_task.py +1 -1
- src/deepeval/commonsense_reasoning_task.py +86 -0
- src/deepeval/complex_reasoning.py +63 -0
- src/deepeval/deepeval_task_manager.py +139 -111
- src/deepeval/nli.py +77 -0
- src/deepeval/reading_comp_mc.py +79 -0
- src/deepeval/sentiment_analysis_task.py +1 -0
- src/deepeval/turkish_general_knowledge_task.py +1 -1
- svc/router.py +4 -1
src/deepeval/base_task.py
CHANGED
@@ -178,7 +178,7 @@ class BaseTask(ABC):
         ]
         allowed_special_tokens = self.tokenizer.apply_chat_template(allowed_token_chat, tokenize=True)
         return allowed_special_tokens
-
+
     @abstractmethod
     def load_dataset_from_hf(self):
         """
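For context on the unchanged lines in this hunk: a minimal, hedged sketch of what apply_chat_template(..., tokenize=True) returns, assuming self.tokenizer is a Hugging Face AutoTokenizer. The contents of allowed_token_chat are not visible in this diff, so the placeholder turns below are hypothetical.

import os
from transformers import AutoTokenizer

# Checkpoint named elsewhere in this commit; it is gated, so access may be required.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", token=os.getenv("HF_TOKEN"))

allowed_token_chat = [
    {"role": "user", "content": ""},        # hypothetical placeholder turn
    {"role": "assistant", "content": "A"},  # hypothetical answer-letter turn
]

# With tokenize=True the chat template is rendered and encoded, so the result is
# a list of token IDs (including the template's special tokens) rather than a string.
allowed_special_tokens = tokenizer.apply_chat_template(allowed_token_chat, tokenize=True)
print(allowed_special_tokens)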
src/deepeval/commonsense_reasoning_task.py
ADDED
@@ -0,0 +1,86 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any


class CommonsenseReasoningTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/commonsense", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(10, len(dataset))))


    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            label = row["label"]
            choices=[row["choice1"], row["choice2"]]
            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
            category = row["difficulty"]
            answer = row["answer"]
            text = row["text"]
            context = row["context"]

            # Prints for debugging
            print(f"Choices: {choices}")
            print("Type of choices:", type(choices))
            print("Type of answer:", type(answer))

            # Get answer index (starting from 0)
            if type(answer) == int:
                answer_index = answer - 1  # 1 or 2
            else:
                answer_index = int(answer) - 1
            correct_answer_letter = chr(65 + answer_index)

            # Get question based on label
            if label == "effect":
                question = "Seçeneklerden hangisi verilen önermenin bir sonucu veya etkisi olabilir?"
            elif label == "cause":
                question = "Seçeneklerden hangisi verilen önermenin bir neden veya sebebi olabilir?"
            else:
                question = "Seçeneklerden hangisi uygun?"  # Alternatif
            # Construct the prompt/message
            instruction = ""
            prompt = f"Bağlam:\n{text}\nÖnerme:\n{context}\nSoru:{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=10)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

            # Print answers
            print(f"Correct Answer: {correct_answer_letter}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")

            # Check if correct based on metric
            if correct_answer_letter == model_answer_cleaned:
                true += 1
                difficulty_results[category]['correct'] += 1

            difficulty_results[category]['total'] += 1

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
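The new tasks all import accuracy and accuracy_standard_error from src/deepeval/utils.py, whose bodies are not part of this diff. A hedged sketch of a conventional implementation, assuming the usual sample proportion and its binomial standard error:

import math

def accuracy(correct: int, total: int) -> float:
    # Fraction of correctly answered items (assumed behaviour, not from this diff).
    return correct / total if total > 0 else 0.0

def accuracy_standard_error(acc: float, total: int) -> float:
    # Standard error of a proportion: sqrt(p * (1 - p) / n) (assumed behaviour).
    return math.sqrt(acc * (1.0 - acc) / total) if total > 0 else 0.0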
src/deepeval/complex_reasoning.py
ADDED
@@ -0,0 +1,63 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import ast


class ComplexReasoningTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/complex-ales", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(10, len(dataset))))


    def evaluate(self) -> dict[str, Any]:
        responses = []
        correct_answers = []

        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            choices = ast.literal_eval(row["choices"])  # Convert string to list
            narrative = row["narrative"]
            question = row["question"]
            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
            correct_answer_letter = row["answer_choice"]
            correct_answers.append(correct_answer_letter)

            # Prints for debugging
            print(f"Choices: {choices}")
            print("Type of choices:", type(choices))


            # Construct the prompt/message
            instruction = ""
            prompt = f"Soru:\n{narrative}\n{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')

            if correct_answer_letter == model_answer_cleaned:
                true += 1
            # Print answers
            print(f"Correct Answer: {correct_answer_letter}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")

        print("Answers:", correct_answers)
        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
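A quick illustration of the ast.literal_eval call used above, with a made-up cell value: the dataset stores the choices column as a string that looks like a Python list, and literal_eval safely parses it back into a real list without executing arbitrary code (unlike eval).

import ast

row_choices = '["3", "5", "7", "9", "11"]'   # hypothetical cell value
choices = ast.literal_eval(row_choices)
print(choices[0], type(choices))              # 3 <class 'list'>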
src/deepeval/deepeval_task_manager.py
CHANGED
@@ -1,112 +1,140 @@
(The 111 lines removed by this commit are truncated in the page extract and are not reproduced; the new file contents follow.)

+import os
+from src.deepeval.turkish_general_knowledge_task import TurkishGeneralKnowledgeTask
+from dotenv import load_dotenv
+from enum import Enum
+from src.deepeval.sentiment_analysis_task import SentimentAnalysisTask
+from src.deepeval.commonsense_reasoning_task import CommonsenseReasoningTask
+from src.deepeval.summarization_task import SummarizationTask
+from src.deepeval.faithfulness_task import FaithfulnessTask
+from src.deepeval.toxicity_task import ToxicityTask
+from src.deepeval.bias_task import BiasTask
+from src.deepeval.instruction_following_task import InstructionFollowingTask
+from src.deepeval.reading_comprehension_task import ReadingComprehensionTask
+from src.deepeval.reading_comp_mc import ReadingComprehensionMCTask
+from src.deepeval.complex_reasoning import ComplexReasoningTask
+from src.deepeval.truthfulness_task import TruthfulnessTask
+from src.deepeval.nli import NLITask
+from typing import List
+load_dotenv()
+
+openai_configs = {
+    'OPENAI_API_KEY': 'OPENAI_KEY'
+}
+os.environ['OPENAI_API_KEY'] = openai_configs['OPENAI_API_KEY']
+
+HF_TOKEN=os.getenv("HF_TOKEN")
+
+class Task(Enum):
+    # SUMMARIZATION = "summarization"
+    SENTIMENT_ANALYSIS = "sentiment_analysis_tr"
+    TURKISH_GENERAL_KNOWLEDGE = "turkish_general_knowledge"
+    SUMMARIZATION = "summarization_tr"
+    FAITHFULNESS = "sosyoloji_faithfulness"
+    TOXICITY = "sosyoloji_toxicity"
+    BIAS = "sosyoloji_bias"
+    INSTRUCTION_FOLLOWING = "instruction_following_tr"
+    READING_COMPREHENSION = "reading_comprehension_mc"
+    READING_COMPREHENSION_OE = "reading_comp_oe"
+    COMMONSENSE_REASONING = "commonsense_reasoning"
+    COMPLEX_REASONING = "complex_reasoning"
+    TRUTHFULNESS = "sosyoloji_truthfulness"
+    NLI = "nli"
+
+
+class DeepEvalTaskManager:
+    def __init__(self, model_name, tasks: List[str]):
+        self.model_name = model_name
+        self.available_tasks = {task.name: getattr(self, task.value) for task in Task}
+        self.tasks_to_run = self.validate_tasks(tasks)
+
+    def validate_tasks(self, user_tasks):
+        """Validate user tasks and store method references."""
+        print(self.available_tasks.keys())
+        print(user_tasks)
+        if not set(user_tasks).issubset(self.available_tasks.keys()):
+            invalid_tasks = set(user_tasks) - self.available_tasks.keys()
+            raise ValueError(f"Invalid task(s) requested: {invalid_tasks}")
+
+        # Store actual method references instead of strings
+        return {task : self.available_tasks[task] for task in user_tasks}
+
+    def run_tasks(self):
+        """Execute validated tasks in order."""
+        results = {}
+        for task_name, task_method in self.tasks_to_run.items():
+            print("Running task: ", task_name)
+            task_enum = getattr(Task, task_name)
+            task_value = task_enum.value
+            results[task_value] = task_method()  # Call the stored method reference
+
+        return results
+
+    def sentiment_analysis_tr(self):
+        st_task = SentimentAnalysisTask(self.model_name)
+        res = st_task.evaluate()
+        return res
+
+    def turkish_general_knowledge(self):
+        turkish_general_knowledge_task = TurkishGeneralKnowledgeTask(self.model_name)
+        res = turkish_general_knowledge_task.evaluate()
+        return res
+
+    def summarization_tr(self):
+        summarization_task = SummarizationTask(self.model_name)
+        res = summarization_task.evaluate()
+        return res
+
+    def sosyoloji_faithfulness(self):
+        faithfulness_task = FaithfulnessTask(self.model_name)
+        res = faithfulness_task.evaluate()
+        return res
+
+    def sosyoloji_toxicity(self):
+        toxicity_task = ToxicityTask(self.model_name)
+        res = toxicity_task.evaluate()
+        return res
+
+    def sosyoloji_bias(self):
+        bias_task = BiasTask(self.model_name)
+        res = bias_task.evaluate()
+        return res
+
+    def instruction_following_tr(self):
+        instruction_following_task = InstructionFollowingTask(self.model_name)
+        res = instruction_following_task.evaluate()
+        return res
+
+    def reading_comprehension_mc(self):
+        reading_comprehension_mc_task = ReadingComprehensionMCTask(self.model_name)
+        res = reading_comprehension_mc_task.evaluate()
+        return res
+
+    def reading_comp_oe(self):
+        reading_comprehension_task = ReadingComprehensionTask(self.model_name)
+        res = reading_comprehension_task.evaluate()
+        return res
+
+    def commonsense_reasoning(self):
+        commonsense_reasoning_task = CommonsenseReasoningTask(self.model_name)
+        res = commonsense_reasoning_task.evaluate()
+        return res
+
+    def complex_reasoning(self):
+        complex_reasoning_task = ComplexReasoningTask(self.model_name)
+        res = complex_reasoning_task.evaluate()
+        return res
+
+    def sosyoloji_truthfulness(self):
+        truthfulness_task = TruthfulnessTask(self.model_name)
+        res = truthfulness_task.evaluate()
+        return res
+
+    def nli(self):
+        nli_task = NLITask(self.model_name)
+        res = nli_task.evaluate()
+        return res
+
+if __name__ == "__main__":
+    des = DeepEvalTaskManager("meta-llama/Llama-3.2-1B-Instruct", ["COMPLEX_REASONING","NLI"])
+    res = des.run_tasks()
     print(res)
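A small illustration of the dispatch pattern used by DeepEvalTaskManager: the enum *name* (e.g. "COMPLEX_REASONING") is what callers pass in, while the enum *value* (e.g. "complex_reasoning") must match a method name on the manager, because available_tasks is built with getattr(self, task.value). The two stub methods below are stand-ins, not the real tasks.

from enum import Enum

class Task(Enum):
    COMPLEX_REASONING = "complex_reasoning"
    NLI = "nli"

class Manager:
    def __init__(self):
        # Maps enum names to bound methods named after the enum values.
        self.available_tasks = {task.name: getattr(self, task.value) for task in Task}

    def complex_reasoning(self):
        return {"acc": 0.5}   # stand-in result

    def nli(self):
        return {"acc": 0.5}   # stand-in result

m = Manager()
print(m.available_tasks["NLI"]())   # calls Manager.nli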
src/deepeval/nli.py
ADDED
@@ -0,0 +1,77 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any


class NLITask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/nli_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(10, len(dataset))))


    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            text = row["text"]
            premise = row["premise"]
            hypothesis = row["hypothesis"]
            label = row["label"].lower().replace(' ','')
            choices=["entailment","contradiction","neutral"]
            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
            category = row["difficulty"]
            correct_answer_letter = "A" if label == "entailment" else \
                                    "B" if label == "contradiction" else \
                                    "C" if label == "neutral" else None


            # Prints for debugging
            print(f"Choices: {choices}")
            print("Type of choices:", type(choices))
            print("Label:", label)

            # Construct the prompt/message
            instruction = ""
            question = "Yukarıdaki cümleler arasındaki ilişki “entailment” (bir cümle diğerini ima eder), “neutral (cümleler birbirini ima etmez ve çelişmez) veya “contradiction (cümleler birbirleriyle çelişir) olarak karakterize edilebilir. Bu ilişkilerden hangisi olduğunu söyleyin."
            context = f"Bağlam:\n{text}\n"  # can add to prompt if needed
            prompt = f"Cümle1:\n{premise}\nCümle2:{hypothesis}\nSoru:\n{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=10)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

            # Print answers
            print(f"Correct Answer: {correct_answer_letter}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")

            # Check if correct based on metric
            if correct_answer_letter == model_answer_cleaned:
                true += 1
                difficulty_results[category]['correct'] += 1

            difficulty_results[category]['total'] += 1

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
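Not part of the commit: an equivalent way to express the label-to-letter mapping used in NLITask.evaluate, shown only to make the intended mapping explicit (A=entailment, B=contradiction, C=neutral).

LABEL_TO_LETTER = {"entailment": "A", "contradiction": "B", "neutral": "C"}

def letter_for(label: str):
    # Mirrors the chained conditional above: unknown labels map to None.
    return LABEL_TO_LETTER.get(label.lower().replace(' ', ''))

print(letter_for("Entailment"))   # "A"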
src/deepeval/reading_comp_mc.py
ADDED
@@ -0,0 +1,79 @@
from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import ast


class ReadingComprehensionMCTask(BaseTask):
    def __init__(self, model_name):
        super().__init__("metunlp/reading_comp_mc", model_name=model_name)

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
        return dataset.select(range(min(10, len(dataset))))


    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            choices = ast.literal_eval(row["choices"])  # Convert string to list
            formatted_choices = "\n".join([f"{chr(65+i)}: {choice}" for i, choice in enumerate(choices)])
            category = row["difficulty"].lower().replace(' ','')
            answer = row["answer"]
            text = row["text"]
            question_about_the_text = row["question_about_the_text"]

            # Prints for debugging
            print(f"Choices: {choices}")
            print("Type of choices:", type(choices))
            print("Type of answer:", type(answer))

            # Get answer index (starting from 0)
            if type(answer) == int:
                answer_index = answer
            else:
                answer_index = int(answer)
            correct_answer_letter = chr(65 + answer_index)


            # Construct the prompt/message
            instruction = ""
            prompt = f"Paragraf:\n{text}\nSoru:{question_about_the_text}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
            responses.append(model_answer)
            model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')

            # Print answers
            print(f"Correct Answer: {correct_answer_letter}")
            print(f"Model Answer: {model_answer}")
            print(f"Model Answer Cleaned: {model_answer_cleaned}")

            # Check if correct based on metric
            if correct_answer_letter == model_answer_cleaned:
                true += 1
                difficulty_results[category]['correct'] += 1

            difficulty_results[category]['total'] += 1

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}
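A quick illustration (with a made-up model output) of the answer-cleaning chain shared by these tasks: whitespace, newlines and colons are stripped and the remainder upper-cased before comparison with the answer letter.

model_answer = " b:\n"                               # hypothetical raw generation
model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':', '')
print(model_answer_cleaned)                          # "B"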
src/deepeval/sentiment_analysis_task.py
CHANGED
@@ -7,6 +7,7 @@ class SentimentAnalysisTask(BaseTask):
         super().__init__("metunlp/sentiment_analysis_tr", model_name=model_name)
 
     def load_dataset_from_hf(self):
+        print("Loading the dataset")
         dataset = super().load_dataset_from_hf()
         return dataset.select(range(min(10, len(dataset))))
 
src/deepeval/turkish_general_knowledge_task.py
CHANGED
@@ -9,7 +9,7 @@ class TurkishGeneralKnowledgeTask(BaseTask):
 
     def load_dataset_from_hf(self):
         dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(
+        return dataset.select(range(min(1, len(dataset))))
 
     def evaluate(self):
         responses = []
svc/router.py
CHANGED
@@ -43,6 +43,10 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
 async def protected_route(username: str = Depends(get_current_user)):
     return {"message": f"Hello, {username}! This is a protected resource."}
 
+@router.get("/deepeval/status")
+async def deep_eval_status():
+    #Return running with 200 status code
+    return {"status": "running"}
 
 @router.post("/chat", response_model=TaskResponse)
 def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_current_user)):

@@ -78,7 +82,6 @@ def inference_model(request: LMHarnessTaskRequest, username: str = Depends(get_current_user)):
     return TaskResponse(results=dumped)
 
 
-
 @router.post("/deepeval/eval", response_model=TaskResponse)
 async def deep_eval_suite(request: DeepEvalSuiteRequest):
     des = DeepEvalTaskManager(request.model_name, request.tasks)