Ahmet Kaan Sever committed
Commit · 8a3d32e
1 Parent(s): 66a11b3
Removed unnecessary debug prints; timestamps now return seconds.

- src/deepeval/base_task.py +4 -4
- src/deepeval/commonsense_reasoning_task.py +6 -6
- src/deepeval/complex_reasoning.py +5 -5
- src/deepeval/deepeval_task_manager.py +2 -2
- src/deepeval/nli.py +6 -6
- src/deepeval/reading_comp_mc.py +6 -6
- src/deepeval/sentiment_analysis_task.py +1 -1
- src/deepeval/summarization_task.py +4 -4
- src/deepeval/turkish_general_knowledge_task.py +5 -5
src/deepeval/base_task.py
CHANGED
@@ -41,7 +41,7 @@ class BaseTask(ABC):
     token=HF_TOKEN, # Replace with actual token
 )
 end_time = datetime.now()
-print(f"Model loaded in {end_time - start_time} seconds.")
+print(f"Model loaded in {(end_time - start_time).seconds} seconds.")
 print("Model loaded.")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 return model, tokenizer
@@ -98,7 +98,7 @@ class BaseTask(ABC):
     {"role": "user", "content": f"{msg}"},
 ]
 formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-print(formatted_chat)
+#print(formatted_chat)
 inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
 input_ids = inputs.input_ids.to(self.model.device)
 attention_mask = inputs.attention_mask.to(self.model.device)
@@ -107,7 +107,7 @@ class BaseTask(ABC):
 letters = [chr(ord('A') + i) for i in range(len(choices))] # Create option letters A, B, C, D, E, ...
 encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
 flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist] # Flatten the list
-print(flattened_encoded_choices)
+#print(flattened_encoded_choices)

 allowed_tokens = flattened_encoded_choices
 allowed_tokens += self.get_chat_template_tokens() # Get the special chat tokens
@@ -199,7 +199,7 @@ class BaseTask(ABC):
 dataset = dataset.shuffle(seed=42).select(range(int(len(dataset) * 0.25)))
 print("Reduced dataset size: ", len(dataset))
 end_time = datetime.now()
-print(f"Dataset loaded in {end_time - start_time} seconds.")
+print(f"Dataset loaded in {(end_time - start_time).seconds} seconds.")
 return dataset

 @abstractmethod
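Side note on the unchanged context above: BaseTask builds the multiple-choice whitelist by encoding each option letter (A, B, C, ...) separately, flattening the token ids into flattened_encoded_choices, and appending the chat-template tokens. The sketch below shows one way such a whitelist can constrain decoding via transformers' prefix_allowed_tokens_fn; the helper name, placeholder model, and prompt are illustrative assumptions, not the repository's actual generate_response_mcqa_multi_token implementation.

# Hedged sketch: restrict decoding to the option-letter token ids,
# in the spirit of the allowed_tokens list built in base_task.py.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder model, not from this repo
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

choices = ["first option", "second option", "third option"]
letters = [chr(ord('A') + i) for i in range(len(choices))]   # A, B, C, ...
encoded = [tokenizer.encode(l, add_special_tokens=False) for l in letters]
allowed_tokens = [tid for ids in encoded for tid in ids]     # flatten, as in the diff
allowed_tokens.append(tokenizer.eos_token_id)                # let generation stop

def restrict_to_choices(batch_id, input_ids):
    # Called at every decoding step; only these token ids may be sampled.
    return allowed_tokens

inputs = tokenizer("Answer with A, B, or C: ...", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=2,
                        prefix_allowed_tokens_fn=restrict_to_choices)
print(tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))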
src/deepeval/commonsense_reasoning_task.py
CHANGED
@@ -32,9 +32,9 @@ class CommonsenseReasoningTask(BaseTask):
 context = row["context"]

 # Prints for debugging
-print(f"Choices: {choices}")
-print("Type of choices:", type(choices))
-print("Type of answer:", type(answer))
+# print(f"Choices: {choices}")
+# print("Type of choices:", type(choices))
+# print("Type of answer:", type(answer))

 # Get answer index (starting from 0)
 if type(answer) == int:
@@ -62,9 +62,9 @@ class CommonsenseReasoningTask(BaseTask):
 model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

 # Print answers
-print(f"Correct Answer: {correct_answer_letter}")
-print(f"Model Answer: {model_answer}")
-print(f"Model Answer Cleaned: {model_answer_cleaned}")
+# print(f"Correct Answer: {correct_answer_letter}")
+# print(f"Model Answer: {model_answer}")
+# print(f"Model Answer Cleaned: {model_answer_cleaned}")

 # Check if correct based on metric
 if correct_answer_letter == model_answer_cleaned:
src/deepeval/complex_reasoning.py
CHANGED
@@ -33,8 +33,8 @@ class ComplexReasoningTask(BaseTask):
 correct_answers.append(correct_answer_letter)

 # Prints for debugging
-print(f"Choices: {choices}")
-print("Type of choices:", type(choices))
+# print(f"Choices: {choices}")
+# print("Type of choices:", type(choices))


 # Construct the prompt/message
@@ -50,9 +50,9 @@ class ComplexReasoningTask(BaseTask):
 if correct_answer_letter == model_answer_cleaned:
     true += 1
 # Print answers
-print(f"Correct Answer: {correct_answer_letter}")
-print(f"Model Answer: {model_answer}")
-print(f"Model Answer Cleaned: {model_answer_cleaned}")
+# print(f"Correct Answer: {correct_answer_letter}")
+# print(f"Model Answer: {model_answer}")
+# print(f"Model Answer Cleaned: {model_answer_cleaned}")

 print("Answers:", correct_answers)
 print("Results:", responses)
src/deepeval/deepeval_task_manager.py
CHANGED
@@ -69,12 +69,12 @@ class DeepEvalTaskManager:
 task_value = task_enum.value
 results[task_value] = task_method() # Call the stored method reference
 end_time = datetime.now()
-print(f"Task {task_name} completed in {end_time - start_time} seconds.")
+print(f"Task {task_name} completed in {(end_time - start_time).seconds} seconds.")
 except Exception as e:
     print(f"Error At Task: {task_name} - {e}")
     continue
 total_end_time = datetime.now()
-print(f"All tasks completed in {total_end_time - total_start_time} seconds.")
+print(f"All tasks completed in {(total_end_time - total_start_time).seconds} seconds.")
 print("All tasks completed.")
 return results
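A note on the timing change (the same pattern is used in base_task.py above): subtracting two datetime.now() values gives a timedelta, and its .seconds attribute is only the seconds field of that delta (0-86399); days are tracked separately and microseconds are dropped. A minimal, self-contained sketch of the behavior, with total_seconds() shown only as a possible alternative, not something this commit adopts:

# Minimal sketch: str(timedelta) vs. timedelta.seconds vs. total_seconds()
from datetime import datetime, timedelta

start_time = datetime.now()
end_time = start_time + timedelta(days=1, seconds=5, microseconds=250000)
delta = end_time - start_time

print(f"completed in {delta} seconds.")                  # pre-commit style: '1 day, 0:00:05.250000 seconds.'
print(f"completed in {delta.seconds} seconds.")          # post-commit style: '5 seconds.' (days/microseconds ignored)
print(f"completed in {delta.total_seconds()} seconds.")  # alternative: '86405.25 seconds.'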
src/deepeval/nli.py
CHANGED
@@ -36,9 +36,9 @@ class NLITask(BaseTask):


 # Prints for debugging
-print(f"Choices: {choices}")
-print("Type of choices:", type(choices))
-print("Label:", label)
+# print(f"Choices: {choices}")
+# print("Type of choices:", type(choices))
+# print("Label:", label)

 # Construct the prompt/message
 instruction = ""
@@ -53,9 +53,9 @@ class NLITask(BaseTask):
 model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

 # Print answers
-print(f"Correct Answer: {correct_answer_letter}")
-print(f"Model Answer: {model_answer}")
-print(f"Model Answer Cleaned: {model_answer_cleaned}")
+# print(f"Correct Answer: {correct_answer_letter}")
+# print(f"Model Answer: {model_answer}")
+# print(f"Model Answer Cleaned: {model_answer_cleaned}")

 # Check if correct based on metric
 if correct_answer_letter == model_answer_cleaned:
src/deepeval/reading_comp_mc.py
CHANGED
@@ -32,9 +32,9 @@ class ReadingComprehensionMCTask(BaseTask):
 question_about_the_text = row["question_about_the_text"]

 # Prints for debugging
-print(f"Choices: {choices}")
-print("Type of choices:", type(choices))
-print("Type of answer:", type(answer))
+# print(f"Choices: {choices}")
+# print("Type of choices:", type(choices))
+# print("Type of answer:", type(answer))

 # Get answer index (starting from 0)
 if type(answer) == int:
@@ -57,9 +57,9 @@ class ReadingComprehensionMCTask(BaseTask):
 model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')

 # Print answers
-print(f"Correct Answer: {correct_answer_letter}")
-print(f"Model Answer: {model_answer}")
-print(f"Model Answer Cleaned: {model_answer_cleaned}")
+# print(f"Correct Answer: {correct_answer_letter}")
+# print(f"Model Answer: {model_answer}")
+# print(f"Model Answer Cleaned: {model_answer_cleaned}")

 # Check if correct based on metric
 if correct_answer_letter == model_answer_cleaned:
src/deepeval/sentiment_analysis_task.py
CHANGED
@@ -23,7 +23,7 @@ class SentimentAnalysisTask(BaseTask):
 prompt = f"Verilen metin hangi duyguyu ifade ediyor? {sentence}\n {formatted_choices}"
 messages = prompt
 answer = self.generate_response_mcqa_multi_token(messages, choices=choices)
-print("Answer:", answer)
+#print("Answer:", answer)
 responses.append(answer)
 correct_answer_letter = "A" if row["sentiment"] == "positive" else "B" if row["sentiment"] == "negative" else "C" if row["sentiment"] == "neutral" else None
 model_answer_cleaned = answer.strip().replace('\n', '').replace(' ', '').upper()
src/deepeval/summarization_task.py
CHANGED
@@ -23,8 +23,8 @@ class SummarizationTask(BaseTask):
 )

 generated_summary = self.generate_response(prompt, max_new_tokens=200)
-print(f"Text: {text_data}\n")
-print(f"Summary: {generated_summary}\n")
+# print(f"Text: {text_data}\n")
+# print(f"Summary: {generated_summary}\n")
 test_case = LLMTestCase(input=text_data, actual_output=generated_summary)

 metric = SummarizationMetric(
@@ -33,8 +33,8 @@ class SummarizationTask(BaseTask):
 )
 metric.measure(test_case)

-print(f"Reason: {metric.reason}")
-print(f"Score Breakdown: {metric.score_breakdown}")
+# print(f"Reason: {metric.reason}")
+# print(f"Score Breakdown: {metric.score_breakdown}")
 results.append({
     "index": i,
     "score": metric.score,
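For context on the summarization hunks: the surrounding code wraps each generated summary in deepeval's LLMTestCase and scores it with SummarizationMetric, keeping metric.score in the results. A minimal usage sketch follows; the threshold argument and the example strings are assumptions, since the metric's actual configuration falls outside the changed lines, and measure() needs an evaluation LLM (OpenAI by default).

# Hedged sketch of the deepeval calls visible in the diff context above.
from deepeval.test_case import LLMTestCase
from deepeval.metrics import SummarizationMetric

text_data = "A long source article..."      # placeholder input text
generated_summary = "A short summary."      # placeholder model output

test_case = LLMTestCase(input=text_data, actual_output=generated_summary)
metric = SummarizationMetric(threshold=0.5)  # assumed arguments; requires an evaluation LLM (OpenAI by default)
metric.measure(test_case)

print(metric.score)            # numeric score recorded in results
print(metric.reason)           # explanation string (printed before this commit)
print(metric.score_breakdown)  # per-dimension breakdown (also printed before this commit)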
src/deepeval/turkish_general_knowledge_task.py
CHANGED
@@ -24,8 +24,8 @@ class TurkishGeneralKnowledgeTask(BaseTask):
 answer_index = row["answer"] # Assuming it's zero-based index
 difficulty = row["difficulty"]

-print(f"Choices: {choices}")
-print("Type of choices:", type(choices))
+# print(f"Choices: {choices}")
+# print("Type of choices:", type(choices))
 # Categorize difficulty
 if difficulty <= 3:
     category = 'easy'
@@ -44,15 +44,15 @@ class TurkishGeneralKnowledgeTask(BaseTask):
 #"""
 model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
 responses.append(model_answer)
-print(f"Correct Answer: {choices[answer_index]}")
-print(f"Model Answer: {model_answer}")
+# print(f"Correct Answer: {choices[answer_index]}")
+# print(f"Model Answer: {model_answer}")

 #TODO: Make the cleaning in the mcqa function
 model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

 # Check if the answer is correct
 correct_answer_letter = chr(65 + answer_index)
-print("Correct Answer Letter:", correct_answer_letter)
+# print("Correct Answer Letter:", correct_answer_letter)

 if correct_answer_letter == model_answer_cleaned:
     true += 1