Add math
src/deepeval/base_task.py CHANGED
@@ -71,6 +71,52 @@ class BaseTask(ABC):
         answer = self.tokenizer.decode(output[0][-1])
 
         return answer
+
+    def generate_response_oeqa_multi_token(self, msg, max_new_tokens=-1, choices: list = []):
+        """
+        Handles multiple-choice questions where answers might have multiple tokens.
+        """
+        # Ensure tokenizer has proper special tokens set
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        if self.model.config.pad_token_id is None:
+            self.model.config.pad_token_id = self.tokenizer.pad_token_id
+
+        chat = [
+            {"role": "user", "content": "You are a question-answering chatbot."},
+            {"role": "assistant", "content": "I am ready to answer your questions. Feel free to ask anything.\n"},
+            {"role": "user", "content": f"{msg}"},
+        ]
+        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+        print(formatted_chat)
+        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
+        input_ids = inputs.input_ids.to(self.model.device)
+        attention_mask = inputs.attention_mask.to(self.model.device)
+
+        # Custom LogitsProcessor to restrict generation
+        class RestrictToABCDLogitsProcessor(LogitsProcessor):
+            def __call__(self, input_ids, scores):
+                mask = torch.full_like(scores, float("-inf"))  # Block all tokens
+                return mask
+        logits_processor = LogitsProcessorList([RestrictToABCDLogitsProcessor()])
+
+        # Generate response
+        output = self.model.generate(
+            input_ids,
+            do_sample=True,
+            attention_mask=attention_mask,
+            #max_new_tokens=max_new_tokens,
+            eos_token_id=self.tokenizer.eos_token_id,
+            pad_token_id=self.tokenizer.pad_token_id,
+            temperature=0.4,
+            #logits_processor=logits_processor,
+        )
+        generated_ids = output[0]  # The generated sequence including the prompt
+        generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
+        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        return generated_text
+
 
     def generate_response_mcqa_multi_token(self, msg, max_new_tokens=5, choices: list = []):
         """
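The RestrictToABCDLogitsProcessor added above masks every token and is left commented out in the generate() call. For reference, a minimal sketch of a processor that only keeps a given set of choice token ids might look like the following; the class name, the allowed_token_ids argument, and the wiring shown in the trailing comments are illustrative assumptions, not code from this commit.

import torch
from transformers import LogitsProcessor, LogitsProcessorList

class RestrictToChoicesLogitsProcessor(LogitsProcessor):
    """Sketch of a processor that blocks everything except a fixed set of token ids."""

    def __init__(self, allowed_token_ids):
        self.allowed_token_ids = allowed_token_ids  # e.g. ids for "A", "B", "C", "D"

    def __call__(self, input_ids, scores):
        mask = torch.full_like(scores, float("-inf"))                    # block all tokens
        mask[:, self.allowed_token_ids] = scores[:, self.allowed_token_ids]  # re-enable the choices
        return mask

# Hypothetical wiring inside generate():
#   allowed = [tokenizer.convert_tokens_to_ids(c) for c in ["A", "B", "C", "D"]]
#   logits_processor = LogitsProcessorList([RestrictToChoicesLogitsProcessor(allowed)])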
src/deepeval/deepeval_task_manager.py CHANGED
@@ -13,6 +13,7 @@ from src.deepeval.reading_comprehension_task import ReadingComprehensionTask
 from src.deepeval.reading_comp_mc import ReadingComprehensionMCTask
 from src.deepeval.complex_reasoning import ComplexReasoningTask
 from src.deepeval.nli import NLITask
+from src.deepeval.math import MathTask
 from typing import List
 load_dotenv()
 
@@ -37,6 +38,7 @@ class Task(Enum):
     READING_COMPREHENSION_MC = "reading_comprehension_mc"
     COMPLEX_REASONING = "complex_reasoning"
     NLI = "nli"
+    MATH = "math"
 
 
 class DeepEvalTaskManager:
@@ -121,7 +123,12 @@ class DeepEvalTaskManager:
         res = nli_task.evaluate()
         return res
 
+    def math(self):
+        math_task = MathTask(self.model_name)
+        res = math_task.evaluate()
+        return res
+
 if __name__ == "__main__":
-    des = DeepEvalTaskManager("
+    des = DeepEvalTaskManager("google/gemma", ["MATH"])
     res = des.run_tasks()
     print(res)
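The new MATH enum member and the math() method line up by name. Assuming run_tasks() resolves each requested task to the manager method of the same name (a getattr-style lookup is an assumption here, not shown in this diff), dispatch would look roughly like:

# Sketch only: how "MATH" could be routed to DeepEvalTaskManager.math().
def run_tasks(self):
    results = {}
    for task in self.tasks:                  # e.g. [Task.MATH]
        method = getattr(self, task.value)   # Task.MATH.value == "math" -> self.math
        results[task.name] = method()        # math() returns {"acc": ..., "acc_stderr": ...}
    return results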
src/deepeval/math.py ADDED
@@ -0,0 +1,82 @@
+from src.deepeval.base_task import BaseTask
+from collections import defaultdict
+from src.deepeval.utils import accuracy, accuracy_standard_error
+from typing import Any
+import re
+
+class MathTask(BaseTask):
+    def __init__(self, model_name):
+        super().__init__("metunlp/math_tr", model_name=model_name)
+
+    def load_dataset_from_hf(self):
+        dataset = super().load_dataset_from_hf()
+        return dataset.select(range(min(10, len(dataset))))
+
+
+    def evaluate(self) -> dict[str, Any]:
+        responses = []
+        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
+        total_count = 0
+        true = 0
+
+        for row in self.dataset:
+            total_count += 1
+
+            # Get values from row
+            category = str(row["difficulty"])
+            answer = row["final_answer"]
+
+            # Prints for debugging
+            print(f"Answer: {answer}")
+            print("Type of answer:", type(answer))
+
+            # Construct the prompt/message
+            instruction = f"""Aşağıdaki matematik problemini verilen nihai cevap formatına uygun olacak şekilde çözün. Tüm adımları gösterdikten sonra, nihai cevabınızı sadece bir kez ve aşağıdaki kurallara uygun şekilde kutu içinde verin.
+
+Nihai Cevap için Uyulması Gereken Format Kuralları:
+
+1. Nihai cevap, tek seferde \\boxed{{...}} içinde verilmeli. Örnek: Cevap 1 ise, "\\boxed{{1}}".
+2. Kesirleri her zaman en sade halde verilmeli.
+    - Matris içi kesirler: x/y biçiminde.
+    - Diğer tüm kesirler: \\frac{{x}}{{y}} biçiminde.
+3. Çarpma işareti (*) kullanmayın. Örnek: 2x yazın, 2*x değil.
+4. Birden çok değişken varsa alfabetik sıraya uyulmalı ve (x, y, z...), polinomları azalan derece sırasına göre yazılmalı.
+5. Her zaman aynı gösterim biçimi kullanılmalı. Ondalık yerine kesir kullanılmalı (ör. 0.5 yerine \\frac{{1}}{{2}} ).
+6. Faktörize polinomlar daima aynı faktör sırası ile verilsin; her sorguda aynı cevabı verecek şekilde tutarlılığı koruyun.
+7. Nihai cevabı kutu dışında tekrar etmeyin, biçimi değiştirmeyin. Aynı soru tekrarlandığında aynı formatı ve cevabı verin.
+
+Görev: Problemi çözün, son adımda yukarıdaki kurallara tam uyan tek bir kutu içinde nihai cevabı verin.
+
+"""
+            prompt = f"{instruction}\n\nSoru:\n{row["question"]}\n"
+            message = prompt
+
+            # Get/format answer of the model
+            model_answer = self.generate_response_oeqa_multi_token(message)
+            responses.append(model_answer)
+            model_answer_cleaned = re.search(r"\\boxed{([^}]*)}", model_answer)
+
+            # Print answers
+            print(f"Correct Answer: {answer}")
+            print(f"Model Answer: {model_answer}")
+            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+            print(f"Result: {answer == model_answer_cleaned}")
+
+            # Check if correct based on metric
+            if answer == model_answer_cleaned:
+                true += 1
+                difficulty_results[category]['correct'] += 1
+
+            difficulty_results[category]['total'] += 1
+
+        # Print results categorized by difficulty
+        for category, stats in difficulty_results.items():
+            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
+            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")
+
+        print("Results:", responses)
+        print("Overall Accuracy:", true / total_count)
+        acc = accuracy(true, total_count)
+        acc_stderr = accuracy_standard_error(acc, total_count)
+        return {"acc": acc, "acc_stderr": acc_stderr}
+
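Note that re.search returns a Match object (or None), so the comparison answer == model_answer_cleaned above compares a string against a Match and never succeeds. A minimal sketch of extracting and comparing the boxed text could look like this; the helper name and the normalization applied are assumptions for illustration, not part of the commit.

import re

def extract_boxed_answer(model_answer: str):
    """Return the text inside the last \\boxed{...}, or None if no box is present (illustrative helper)."""
    matches = re.findall(r"\\boxed{([^}]*)}", model_answer)
    return matches[-1].strip() if matches else None

# Hypothetical use inside evaluate():
#   model_answer_cleaned = extract_boxed_answer(model_answer)
#   is_correct = model_answer_cleaned is not None and model_answer_cleaned == str(answer).strip()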
src/deepeval/nli.py CHANGED
@@ -41,11 +41,11 @@ class NLITask(BaseTask):
             instruction = ""
             question = "Yukarıdaki cümleler arasındaki ilişki “entailment” (bir cümle diğerini ima eder), “neutral (cümleler birbirini ima etmez ve çelişmez) veya “contradiction (cümleler birbirleriyle çelişir) olarak karakterize edilebilir. Bu ilişkilerden hangisi olduğunu söyleyin."
             context = f"Bağlam:\n{row["text"]}\n" # can add to prompt if needed
-            prompt = f"Cümle1
+            prompt = f"Cümle1: {row["premise"]}\nCümle2: {row["hypothesis"]}\nSoru:\n{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
             message = prompt
 
             # Get/format answer of the model
-            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=
+            model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
             responses.append(model_answer)
             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()
 
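For reference, a rough illustration of the prompt the fixed f-string produces; the sample premise/hypothesis and the choice formatting below are invented for this example, not taken from the dataset.

row = {"premise": "Ali dün akşam sinemaya gitti.", "hypothesis": "Ali dün akşam evdeydi."}
formatted_choices = "A) entailment\nB) neutral\nC) contradiction"
question = "Yukarıdaki cümleler arasındaki ilişki entailment, neutral veya contradiction olarak karakterize edilebilir. Bu ilişkilerden hangisi olduğunu söyleyin."
instruction = ""
prompt = f"Cümle1: {row['premise']}\nCümle2: {row['hypothesis']}\nSoru:\n{question}\nSeçenekler:\n{formatted_choices}\n{instruction}\n"
print(prompt)  # shows the two sentences, the question, and the lettered options on separate lines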
src/deepeval/reading_comp_mc.py CHANGED
@@ -56,6 +56,7 @@ class ReadingComprehensionMCTask(BaseTask):
             print(f"Correct Answer: {correct_answer_letter}")
             print(f"Model Answer: {model_answer}")
             print(f"Model Answer Cleaned: {model_answer_cleaned}")
+            print(f"Result: {correct_answer_letter == model_answer_cleaned}")
 
             # Check if correct based on metric
             if correct_answer_letter == model_answer_cleaned:
src/deepeval/turkish_general_knowledge_task.py CHANGED
@@ -9,7 +9,7 @@ class TurkishGeneralKnowledgeTask(BaseTask):
 
     def load_dataset_from_hf(self):
        dataset = super().load_dataset_from_hf()
-        return dataset.select(range(min(
+        return dataset.select(range(min(10, len(dataset))))
 
     def evaluate(self):
         responses = []