Ahmet Kaan Sever committed
Commit 8a3d32e · 1 Parent(s): 66a11b3

Removed unnecessary debug prints; timestamps now return seconds.

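A note on the timing change below: timedelta.seconds is only the seconds component of the interval (an int in 0-86399), not the total duration, so days and sub-second parts are dropped. That is fine for the short runs timed here; total_seconds() would be the lossless alternative. A minimal standard-library sketch of the difference (values are illustrative):

from datetime import datetime, timedelta

start_time = datetime.now()
end_time = start_time + timedelta(days=1, seconds=5, milliseconds=250)

print((end_time - start_time).seconds)          # 5 -- days and fractions are dropped
print((end_time - start_time).total_seconds())  # 86405.25 -- full duration in seconds
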
src/deepeval/base_task.py CHANGED
@@ -41,7 +41,7 @@ class BaseTask(ABC):
             token=HF_TOKEN, # Replace with actual token
         )
         end_time = datetime.now()
-        print(f"Model loaded in {end_time - start_time} seconds.")
+        print(f"Model loaded in {(end_time - start_time).seconds} seconds.")
         print("Model loaded.")
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         return model, tokenizer
@@ -98,7 +98,7 @@ class BaseTask(ABC):
             {"role": "user", "content": f"{msg}"},
         ]
         formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-        print(formatted_chat)
+        #print(formatted_chat)
         inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
         input_ids = inputs.input_ids.to(self.model.device)
         attention_mask = inputs.attention_mask.to(self.model.device)
@@ -107,7 +107,7 @@ class BaseTask(ABC):
         letters = [chr(ord('A') + i) for i in range(len(choices))] # Create option letters A, B, C, D, E, ...
         encoded_choices = [self.tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
         flattened_encoded_choices = [item for sublist in encoded_choices for item in sublist] # Flatten the list
-        print(flattened_encoded_choices)
+        #print(flattened_encoded_choices)

         allowed_tokens = flattened_encoded_choices
         allowed_tokens += self.get_chat_template_tokens() # Get the special chat tokens
@@ -199,7 +199,7 @@ class BaseTask(ABC):
         dataset = dataset.shuffle(seed=42).select(range(int(len(dataset) * 0.25)))
         print("Reduced dataset size: ", len(dataset))
         end_time = datetime.now()
-        print(f"Dataset loaded in {end_time - start_time} seconds.")
+        print(f"Dataset loaded in {(end_time - start_time).seconds} seconds.")
         return dataset

     @abstractmethod
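For context on the @@ -107,7 hunk: the encode-and-flatten step builds the token ids that the constrained MCQA decoding is allowed to emit. A minimal sketch of just that step, assuming a Hugging Face AutoTokenizer (the model name is illustrative, not the repo's):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative model choice

choices = ["cat", "dog", "bird"]
letters = [chr(ord('A') + i) for i in range(len(choices))]  # ['A', 'B', 'C']
encoded_choices = [tokenizer.encode(letter, add_special_tokens=False) for letter in letters]
allowed_tokens = [item for sublist in encoded_choices for item in sublist]  # one id list per letter, flattened

print(letters, allowed_tokens)
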
src/deepeval/commonsense_reasoning_task.py CHANGED
@@ -32,9 +32,9 @@ class CommonsenseReasoningTask(BaseTask):
             context = row["context"]

             # Prints for debugging
-            print(f"Choices: {choices}")
-            print("Type of choices:", type(choices))
-            print("Type of answer:", type(answer))
+            # print(f"Choices: {choices}")
+            # print("Type of choices:", type(choices))
+            # print("Type of answer:", type(answer))

             # Get answer index (starting from 0)
             if type(answer) == int:
@@ -62,9 +62,9 @@ class CommonsenseReasoningTask(BaseTask):
             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

             # Print answers
-            print(f"Correct Answer: {correct_answer_letter}")
-            print(f"Model Answer: {model_answer}")
-            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+            # print(f"Correct Answer: {correct_answer_letter}")
+            # print(f"Model Answer: {model_answer}")
+            # print(f"Model Answer Cleaned: {model_answer_cleaned}")

             # Check if correct based on metric
             if correct_answer_letter == model_answer_cleaned:
src/deepeval/complex_reasoning.py CHANGED
@@ -33,8 +33,8 @@ class ComplexReasoningTask(BaseTask):
             correct_answers.append(correct_answer_letter)

             # Prints for debugging
-            print(f"Choices: {choices}")
-            print("Type of choices:", type(choices))
+            # print(f"Choices: {choices}")
+            # print("Type of choices:", type(choices))


             # Construct the prompt/message
@@ -50,9 +50,9 @@ class ComplexReasoningTask(BaseTask):
             if correct_answer_letter == model_answer_cleaned:
                 true += 1
             # Print answers
-            print(f"Correct Answer: {correct_answer_letter}")
-            print(f"Model Answer: {model_answer}")
-            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+            # print(f"Correct Answer: {correct_answer_letter}")
+            # print(f"Model Answer: {model_answer}")
+            # print(f"Model Answer Cleaned: {model_answer_cleaned}")

             print("Answers:", correct_answers)
             print("Results:", responses)
src/deepeval/deepeval_task_manager.py CHANGED
@@ -69,12 +69,12 @@ class DeepEvalTaskManager:
                 task_value = task_enum.value
                 results[task_value] = task_method() # Call the stored method reference
                 end_time = datetime.now()
-                print(f"Task {task_name} completed in {end_time - start_time} seconds.")
+                print(f"Task {task_name} completed in {(end_time - start_time).seconds} seconds.")
             except Exception as e:
                 print(f"Error At Task: {task_name} - {e}")
                 continue
         total_end_time = datetime.now()
-        print(f"All tasks completed in {total_end_time - total_start_time} seconds.")
+        print(f"All tasks completed in {(total_end_time - total_start_time).seconds} seconds.")
        print("All tasks completed.")
        return results
 
src/deepeval/nli.py CHANGED
@@ -36,9 +36,9 @@ class NLITask(BaseTask):


             # Prints for debugging
-            print(f"Choices: {choices}")
-            print("Type of choices:", type(choices))
-            print("Label:", label)
+            # print(f"Choices: {choices}")
+            # print("Type of choices:", type(choices))
+            # print("Label:", label)

             # Construct the prompt/message
             instruction = ""
@@ -53,9 +53,9 @@ class NLITask(BaseTask):
             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

             # Print answers
-            print(f"Correct Answer: {correct_answer_letter}")
-            print(f"Model Answer: {model_answer}")
-            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+            # print(f"Correct Answer: {correct_answer_letter}")
+            # print(f"Model Answer: {model_answer}")
+            # print(f"Model Answer Cleaned: {model_answer_cleaned}")

             # Check if correct based on metric
             if correct_answer_letter == model_answer_cleaned:
src/deepeval/reading_comp_mc.py CHANGED
@@ -32,9 +32,9 @@ class ReadingComprehensionMCTask(BaseTask):
             question_about_the_text = row["question_about_the_text"]

             # Prints for debugging
-            print(f"Choices: {choices}")
-            print("Type of choices:", type(choices))
-            print("Type of answer:", type(answer))
+            # print(f"Choices: {choices}")
+            # print("Type of choices:", type(choices))
+            # print("Type of answer:", type(answer))

             # Get answer index (starting from 0)
             if type(answer) == int:
@@ -57,9 +57,9 @@ class ReadingComprehensionMCTask(BaseTask):
             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper().replace(':','')

             # Print answers
-            print(f"Correct Answer: {correct_answer_letter}")
-            print(f"Model Answer: {model_answer}")
-            print(f"Model Answer Cleaned: {model_answer_cleaned}")
+            # print(f"Correct Answer: {correct_answer_letter}")
+            # print(f"Model Answer: {model_answer}")
+            # print(f"Model Answer Cleaned: {model_answer_cleaned}")

             # Check if correct based on metric
             if correct_answer_letter == model_answer_cleaned:
src/deepeval/sentiment_analysis_task.py CHANGED
@@ -23,7 +23,7 @@ class SentimentAnalysisTask(BaseTask):
             prompt = f"Verilen metin hangi duyguyu ifade ediyor? {sentence}\n {formatted_choices}"
             messages = prompt
             answer = self.generate_response_mcqa_multi_token(messages, choices=choices)
-            print("Answer:", answer)
+            #print("Answer:", answer)
             responses.append(answer)
             correct_answer_letter = "A" if row["sentiment"] == "positive" else "B" if row["sentiment"] == "negative" else "C" if row["sentiment"] == "neutral" else None
             model_answer_cleaned = answer.strip().replace('\n', '').replace(' ', '').upper()
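The chained conditional that maps a sentiment label to its option letter is easy to misread; an equivalent dict lookup, shown purely as a readability sketch (not part of the commit):

row = {"sentiment": "negative"}  # stands in for a dataset row
sentiment_to_letter = {"positive": "A", "negative": "B", "neutral": "C"}
correct_answer_letter = sentiment_to_letter.get(row["sentiment"])  # "B"; None for unknown labels
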
src/deepeval/summarization_task.py CHANGED
@@ -23,8 +23,8 @@ class SummarizationTask(BaseTask):
             )

             generated_summary = self.generate_response(prompt, max_new_tokens=200)
-            print(f"Text: {text_data}\n")
-            print(f"Summary: {generated_summary}\n")
+            # print(f"Text: {text_data}\n")
+            # print(f"Summary: {generated_summary}\n")
             test_case = LLMTestCase(input=text_data, actual_output=generated_summary)

             metric = SummarizationMetric(
@@ -33,8 +33,8 @@ class SummarizationTask(BaseTask):
             )
             metric.measure(test_case)

-            print(f"Reason: {metric.reason}")
-            print(f"Score Breakdown: {metric.score_breakdown}")
+            # print(f"Reason: {metric.reason}")
+            # print(f"Score Breakdown: {metric.score_breakdown}")
             results.append({
                 "index": i,
                 "score": metric.score,
src/deepeval/turkish_general_knowledge_task.py CHANGED
@@ -24,8 +24,8 @@ class TurkishGeneralKnowledgeTask(BaseTask):
             answer_index = row["answer"] # Assuming it's zero-based index
             difficulty = row["difficulty"]

-            print(f"Choices: {choices}")
-            print("Type of choices:", type(choices))
+            # print(f"Choices: {choices}")
+            # print("Type of choices:", type(choices))
             # Categorize difficulty
             if difficulty <= 3:
                 category = 'easy'
@@ -44,15 +44,15 @@ class TurkishGeneralKnowledgeTask(BaseTask):
             #"""
             model_answer = self.generate_response_mcqa_multi_token(message, choices=choices, max_new_tokens=2)
             responses.append(model_answer)
-            print(f"Correct Answer: {choices[answer_index]}")
-            print(f"Model Answer: {model_answer}")
+            # print(f"Correct Answer: {choices[answer_index]}")
+            # print(f"Model Answer: {model_answer}")

             #TODO: Make the cleaning in the mcqa function
             model_answer_cleaned = model_answer.strip().replace('\n', '').replace(' ', '').upper()

             # Check if the answer is correct
             correct_answer_letter = chr(65 + answer_index)
-            print("Correct Answer Letter:", correct_answer_letter)
+            # print("Correct Answer Letter:", correct_answer_letter)

             if correct_answer_letter == model_answer_cleaned:
                 true += 1
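On the "#TODO: Make the cleaning in the mcqa function" left in this hunk: the same strip/replace/upper chain recurs across the task files, so it could live in one helper. A hypothetical sketch (the name and placement are assumptions, not part of the commit):

def clean_mcqa_answer(model_answer: str) -> str:
    """Normalise a model's MCQA reply to a bare option letter."""
    return model_answer.strip().replace('\n', '').replace(' ', '').upper()

print(clean_mcqa_answer(" b\n"))  # "B"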