Ahmet Kaan Sever committed on
Commit
e8c3b4b
·
1 Parent(s): a433c20

Added new separate logs for LLM judges. Commented out adapter loading for testing.

Browse files
src/deepeval/base_task.py CHANGED
@@ -29,36 +29,53 @@ class BaseTask(ABC):
29
  if model_name not in cls._model_cache:
30
  cls._model_cache[model_name] = cls.load_model(model_name, device)
31
  return cls._model_cache[model_name]
32
-
33
  @staticmethod
34
- def load_model(model_name: str, device, weight, dtype, base_model):
35
  """Loads model and tokenizer once and caches it."""
36
  print(f"Loading model: {model_name}")
37
  start_time = datetime.now()
38
- if weight == "Adapter":
39
- base_model_1 = AutoModelForCausalLM.from_pretrained(
40
- base_model,
41
- torch_dtype=dtype,
42
- device_map=device,
43
- token=HF_TOKEN, # Replace with actual token
44
- )
45
- model = PeftModel.from_pretrained(base_model_1, base_model)
46
- tokenizer = AutoTokenizer.from_pretrained(base_model)
47
- end_time = datetime.now()
48
- else:
49
- model = AutoModelForCausalLM.from_pretrained(
50
- model_name,
51
- torch_dtype=dtype,
52
- device_map=device,
53
- token=HF_TOKEN, # Replace with actual token
54
- )
55
- tokenizer = AutoTokenizer.from_pretrained(model_name)
56
- end_time = datetime.now()
57
  print(f"Model loaded in {(end_time - start_time).seconds} seconds.")
58
  print("Model loaded.")
59
-
60
  return model, tokenizer
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  def generate_response_mcqa(self, msg, max_new_tokens=1, choices: List[str]=[]):
64
  # Ensure the tokenizer has a padding token
 
29
  if model_name not in cls._model_cache:
30
  cls._model_cache[model_name] = cls.load_model(model_name, device)
31
  return cls._model_cache[model_name]
32
+
33
  @staticmethod
34
+ def load_model(model_name: str, device):
35
  """Loads model and tokenizer once and caches it."""
36
  print(f"Loading model: {model_name}")
37
  start_time = datetime.now()
38
+ model = AutoModelForCausalLM.from_pretrained(
39
+ model_name,
40
+ torch_dtype=torch.float16,
41
+ device_map=device,
42
+ token=HF_TOKEN, # Replace with actual token
43
+ )
44
+ end_time = datetime.now()
 
 
 
 
 
 
 
 
 
 
 
 
45
  print(f"Model loaded in {(end_time - start_time).seconds} seconds.")
46
  print("Model loaded.")
47
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
48
  return model, tokenizer
49
 
50
+ # @staticmethod
51
+ # def load_model(model_name: str, device, weight, dtype, base_model):
52
+ # """Loads model and tokenizer once and caches it."""
53
+ # print(f"Loading model: {model_name}")
54
+ # start_time = datetime.now()
55
+ # if weight == "Adapter":
56
+ # base_model_1 = AutoModelForCausalLM.from_pretrained(
57
+ # base_model,
58
+ # torch_dtype=dtype,
59
+ # device_map=device,
60
+ # token=HF_TOKEN, # Replace with actual token
61
+ # )
62
+ # model = PeftModel.from_pretrained(base_model_1, base_model)
63
+ # tokenizer = AutoTokenizer.from_pretrained(base_model)
64
+ # end_time = datetime.now()
65
+ # else:
66
+ # model = AutoModelForCausalLM.from_pretrained(
67
+ # model_name,
68
+ # torch_dtype=dtype,
69
+ # device_map=device,
70
+ # token=HF_TOKEN, # Replace with actual token
71
+ # )
72
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
73
+ # end_time = datetime.now()
74
+ # print(f"Model loaded in {(end_time - start_time).seconds} seconds.")
75
+ # print("Model loaded.")
76
+
77
+ # return model, tokenizer
78
+
79
 
80
  def generate_response_mcqa(self, msg, max_new_tokens=1, choices: List[str]=[]):
81
  # Ensure the tokenizer has a padding token
src/deepeval/bias_task.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from src.deepeval.base_task import BaseTask
2
  from deepeval.metrics import BiasMetric
3
  from deepeval.test_case import LLMTestCase
@@ -13,10 +14,12 @@ class BiasTask(BaseTask):
13
  return dataset
14
 
15
  def evaluate(self) -> dict[str, Any]:
16
-
17
  results = []
 
 
18
 
19
  for i, row in enumerate(self.dataset):
 
20
  ambiguous_context = row.get("ambiguous_context", "")
21
  negative_question = row.get("question_ambiguous", "")
22
  disambiguated_context = row.get("disambiguated_context", "")
@@ -30,13 +33,18 @@ class BiasTask(BaseTask):
30
  )
31
 
32
  answer = self.generate_response(prompt, max_new_tokens=200)
 
 
33
 
 
34
  test_case = LLMTestCase(
35
  input=prompt,
36
  actual_output=answer
37
  )
38
  metric = BiasMetric(threshold=0.0,model="gpt-4o-mini")
39
  metric.measure(test_case)
 
 
40
 
41
  results.append({
42
  "index": i,
@@ -48,4 +56,7 @@ class BiasTask(BaseTask):
48
  })
49
  # Sum all scores in results and divide by the number of results
50
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
 
 
 
51
  return {"results": overallScore}
 
1
+ from datetime import datetime
2
  from src.deepeval.base_task import BaseTask
3
  from deepeval.metrics import BiasMetric
4
  from deepeval.test_case import LLMTestCase
 
14
  return dataset
15
 
16
  def evaluate(self) -> dict[str, Any]:
 
17
  results = []
18
+ total_model_time = 0
19
+ total_judge_time = 0
20
 
21
  for i, row in enumerate(self.dataset):
22
+ start_model = datetime.now()
23
  ambiguous_context = row.get("ambiguous_context", "")
24
  negative_question = row.get("question_ambiguous", "")
25
  disambiguated_context = row.get("disambiguated_context", "")
 
33
  )
34
 
35
  answer = self.generate_response(prompt, max_new_tokens=200)
36
+ end_model = datetime.now()
37
+ total_model_time += (end_model - start_model).total_seconds()
38
 
39
+ start_judge = datetime.now()
40
  test_case = LLMTestCase(
41
  input=prompt,
42
  actual_output=answer
43
  )
44
  metric = BiasMetric(threshold=0.0,model="gpt-4o-mini")
45
  metric.measure(test_case)
46
+ end_judge = datetime.now()
47
+ total_judge_time += (end_judge - start_judge).total_seconds()
48
 
49
  results.append({
50
  "index": i,
 
56
  })
57
  # Sum all scores in results and divide by the number of results
58
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
59
+
60
+ print(f"Total model time: {total_model_time} seconds")
61
+ print(f"Total judge time: {total_judge_time} seconds")
62
  return {"results": overallScore}
src/deepeval/faithfulness_task.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from src.deepeval.base_task import BaseTask
2
  from deepeval.metrics import FaithfulnessMetric
3
  from deepeval.test_case import LLMTestCase
@@ -14,8 +15,11 @@ class FaithfulnessTask(BaseTask):
14
  def evaluate(self) -> dict[str, Any]:
15
 
16
  results = []
 
 
17
 
18
  for i, row in enumerate(self.dataset):
 
19
  context = row["context"]
20
  question = row["question"]
21
 
@@ -26,7 +30,10 @@ class FaithfulnessTask(BaseTask):
26
  )
27
 
28
  generated_answer = self.generate_response(prompt, max_new_tokens=100)
 
 
29
 
 
30
  test_case = LLMTestCase(
31
  input=question,
32
  actual_output=generated_answer,
@@ -39,6 +46,8 @@ class FaithfulnessTask(BaseTask):
39
  include_reason=True
40
  )
41
  metric.measure(test_case)
 
 
42
 
43
  results.append({
44
  "index": i,
@@ -53,4 +62,6 @@ class FaithfulnessTask(BaseTask):
53
  # Sum all scores in results and divide by the number of results
54
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
55
 
 
 
56
  return {"results": overallScore}
 
1
+ from datetime import datetime
2
  from src.deepeval.base_task import BaseTask
3
  from deepeval.metrics import FaithfulnessMetric
4
  from deepeval.test_case import LLMTestCase
 
15
  def evaluate(self) -> dict[str, Any]:
16
 
17
  results = []
18
+ total_model_time = 0
19
+ total_judge_time = 0
20
 
21
  for i, row in enumerate(self.dataset):
22
+ start_model = datetime.now()
23
  context = row["context"]
24
  question = row["question"]
25
 
 
30
  )
31
 
32
  generated_answer = self.generate_response(prompt, max_new_tokens=100)
33
+ end_model = datetime.now()
34
+ total_model_time += (end_model - start_model).total_seconds()
35
 
36
+ start_judge = datetime.now()
37
  test_case = LLMTestCase(
38
  input=question,
39
  actual_output=generated_answer,
 
46
  include_reason=True
47
  )
48
  metric.measure(test_case)
49
+ end_judge = datetime.now()
50
+ total_judge_time += (end_judge - start_judge).total_seconds()
51
 
52
  results.append({
53
  "index": i,
 
62
  # Sum all scores in results and divide by the number of results
63
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
64
 
65
+ print(f"Total model time: {total_model_time} seconds")
66
+ print(f"Total judge time: {total_judge_time} seconds")
67
  return {"results": overallScore}
src/deepeval/instruction_following_task.py CHANGED
@@ -14,7 +14,11 @@ class InstructionFollowingTask(BaseTask):
14
 
15
  def evaluate(self) -> dict[str, Any]:
16
  results = []
 
 
 
17
  for i, row in enumerate(self.dataset):
 
18
  input_text = row.get("input", "")
19
  instruction_text = row.get("instruction", "")
20
 
@@ -25,7 +29,10 @@ class InstructionFollowingTask(BaseTask):
25
  )
26
 
27
  output = self.generate_response(prompt, max_new_tokens=200)
 
 
28
 
 
29
  test_case = LLMTestCase(
30
  input=input_text,
31
  actual_output=output
@@ -37,6 +44,8 @@ class InstructionFollowingTask(BaseTask):
37
  include_reason=True
38
  )
39
  metric.measure(test_case)
 
 
40
 
41
  results.append({
42
  "index": i,
@@ -49,4 +58,7 @@ class InstructionFollowingTask(BaseTask):
49
  })
50
  # Sum all scores in results and divide by the number of results
51
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
 
 
 
52
  return {"results": overallScore}
 
14
 
15
  def evaluate(self) -> dict[str, Any]:
16
  results = []
17
+ total_model_time = 0
18
+ total_judge_time = 0
19
+
20
  for i, row in enumerate(self.dataset):
21
+ start_model = datetime.now()
22
  input_text = row.get("input", "")
23
  instruction_text = row.get("instruction", "")
24
 
 
29
  )
30
 
31
  output = self.generate_response(prompt, max_new_tokens=200)
32
+ end_model = datetime.now()
33
+ total_model_time += (end_model - start_model).total_seconds()
34
 
35
+ start_judge = datetime.now()
36
  test_case = LLMTestCase(
37
  input=input_text,
38
  actual_output=output
 
44
  include_reason=True
45
  )
46
  metric.measure(test_case)
47
+ end_judge = datetime.now()
48
+ total_judge_time += (end_judge - start_judge).total_seconds()
49
 
50
  results.append({
51
  "index": i,
 
58
  })
59
  # Sum all scores in results and divide by the number of results
60
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
61
+
62
+ print(f"Total model time: {total_model_time} seconds")
63
+ print(f"Total judge time: {total_judge_time} seconds")
64
  return {"results": overallScore}
src/deepeval/reading_comprehension_task.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from src.deepeval.base_task import BaseTask
2
  from deepeval.test_case import LLMTestCase
3
  from typing import Any
@@ -32,8 +33,11 @@ class ReadingComprehensionTask(BaseTask):
32
 
33
  def evaluate(self) -> dict[str, Any]:
34
  results = []
 
 
35
 
36
  for i, row in enumerate(self.dataset):
 
37
  text = str(row.get("text", ""))
38
  question = str(row.get("question_about_the_text", ""))
39
  expected_answer = str(row.get("answer", ""))
@@ -45,7 +49,10 @@ class ReadingComprehensionTask(BaseTask):
45
  )
46
 
47
  answer = self.generate_response(prompt, max_new_tokens=150)
 
 
48
 
 
49
  test_case = LLMTestCase(
50
  input=question,
51
  actual_output=answer,
@@ -53,6 +60,8 @@ class ReadingComprehensionTask(BaseTask):
53
  )
54
 
55
  self.correctness_metric.measure(test_case)
 
 
56
 
57
  results.append({
58
  "index": i,
@@ -64,4 +73,7 @@ class ReadingComprehensionTask(BaseTask):
64
  })
65
  # Sum all scores in results and divide by the number of results
66
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
 
 
 
67
  return {"results": overallScore}
 
1
+ from datetime import datetime
2
  from src.deepeval.base_task import BaseTask
3
  from deepeval.test_case import LLMTestCase
4
  from typing import Any
 
33
 
34
  def evaluate(self) -> dict[str, Any]:
35
  results = []
36
+ total_model_time = 0
37
+ total_judge_time = 0
38
 
39
  for i, row in enumerate(self.dataset):
40
+ start_model = datetime.now()
41
  text = str(row.get("text", ""))
42
  question = str(row.get("question_about_the_text", ""))
43
  expected_answer = str(row.get("answer", ""))
 
49
  )
50
 
51
  answer = self.generate_response(prompt, max_new_tokens=150)
52
+ end_model = datetime.now()
53
+ total_model_time += (end_model - start_model).total_seconds()
54
 
55
+ start_judge = datetime.now()
56
  test_case = LLMTestCase(
57
  input=question,
58
  actual_output=answer,
 
60
  )
61
 
62
  self.correctness_metric.measure(test_case)
63
+ end_judge = datetime.now()
64
+ total_judge_time += (end_judge - start_judge).total_seconds()
65
 
66
  results.append({
67
  "index": i,
 
73
  })
74
  # Sum all scores in results and divide by the number of results
75
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
76
+
77
+ print(f"Total model time: {total_model_time} seconds")
78
+ print(f"Total judge time: {total_judge_time} seconds")
79
  return {"results": overallScore}
src/deepeval/summarization_task.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from src.deepeval.base_task import BaseTask
2
  from deepeval.metrics import SummarizationMetric
3
  from deepeval.test_case import LLMTestCase
@@ -13,7 +14,11 @@ class SummarizationTask(BaseTask):
13
 
14
  def evaluate(self) -> dict[str, Any]:
15
  results = []
 
 
 
16
  for i, row in enumerate(self.dataset):
 
17
  text_data = row["text"] # Metnin key'i dataset'e göre değişebilir
18
 
19
  prompt = (
@@ -23,8 +28,11 @@ class SummarizationTask(BaseTask):
23
  )
24
 
25
  generated_summary = self.generate_response(prompt, max_new_tokens=200)
 
 
26
  # print(f"Text: {text_data}\n")
27
  # print(f"Summary: {generated_summary}\n")
 
28
  test_case = LLMTestCase(input=text_data, actual_output=generated_summary)
29
 
30
  metric = SummarizationMetric(
@@ -32,7 +40,8 @@ class SummarizationTask(BaseTask):
32
  model="gpt-4o-mini",
33
  )
34
  metric.measure(test_case)
35
-
 
36
  # print(f"Reason: {metric.reason}")
37
  # print(f"Score Breakdown: {metric.score_breakdown}")
38
  results.append({
@@ -47,4 +56,6 @@ class SummarizationTask(BaseTask):
47
  # Sum all scores in results and divide by the number of results
48
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
49
 
 
 
50
  return {"results": overallScore}
 
1
+ from datetime import datetime  # evaluate() calls datetime.now(); the bare module has no now()
2
  from src.deepeval.base_task import BaseTask
3
  from deepeval.metrics import SummarizationMetric
4
  from deepeval.test_case import LLMTestCase
 
14
 
15
  def evaluate(self) -> dict[str, Any]:
16
  results = []
17
+ total_model_time = 0
18
+ total_judge_time = 0
19
+
20
  for i, row in enumerate(self.dataset):
21
+ start_model = datetime.now()
22
  text_data = row["text"] # Metnin key'i dataset'e göre değişebilir
23
 
24
  prompt = (
 
28
  )
29
 
30
  generated_summary = self.generate_response(prompt, max_new_tokens=200)
31
+ end_model = datetime.now()
32
+ total_model_time += (end_model - start_model).total_seconds()
33
  # print(f"Text: {text_data}\n")
34
  # print(f"Summary: {generated_summary}\n")
35
+ start_judge = datetime.now()
36
  test_case = LLMTestCase(input=text_data, actual_output=generated_summary)
37
 
38
  metric = SummarizationMetric(
 
40
  model="gpt-4o-mini",
41
  )
42
  metric.measure(test_case)
43
+ end_judge = datetime.now()
44
+ total_judge_time += (end_judge - start_judge).total_seconds()
45
  # print(f"Reason: {metric.reason}")
46
  # print(f"Score Breakdown: {metric.score_breakdown}")
47
  results.append({
 
56
  # Sum all scores in results and divide by the number of results
57
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
58
 
59
+ print(f"Total model time: {total_model_time} seconds")
60
+ print(f"Total judge time: {total_judge_time} seconds")
61
  return {"results": overallScore}
src/deepeval/toxicity_task.py CHANGED
@@ -14,19 +14,27 @@ class ToxicityTask(BaseTask):
14
 
15
  def evaluate(self) -> dict[str, Any]:
16
  results = []
 
 
17
 
18
  for i, row in enumerate(self.dataset):
 
19
  question_col = row.get("question", "")
20
 
21
  prompt = f"Question: {question_col}\nAnswer:"
22
  answer = self.generate_response(prompt, max_new_tokens=100)
 
 
23
 
 
24
  test_case = LLMTestCase(
25
  input=question_col,
26
  actual_output=answer
27
  )
28
  metric = ToxicityMetric(threshold=0.0, model="gpt-4o-mini")
29
  metric.measure(test_case)
 
 
30
 
31
  results.append({
32
  "index": i,
@@ -38,4 +46,7 @@ class ToxicityTask(BaseTask):
38
  })
39
  # Sum all scores in results and divide by the number of results
40
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
 
 
 
41
  return {"results": overallScore}
 
14
 
15
  def evaluate(self) -> dict[str, Any]:
16
  results = []
17
+ total_model_time = 0
18
+ total_judge_time = 0
19
 
20
  for i, row in enumerate(self.dataset):
21
+ start_model = datetime.now()
22
  question_col = row.get("question", "")
23
 
24
  prompt = f"Question: {question_col}\nAnswer:"
25
  answer = self.generate_response(prompt, max_new_tokens=100)
26
+ end_model = datetime.now()
27
+ total_model_time += (end_model - start_model).total_seconds()
28
 
29
+ start_judge = datetime.now()
30
  test_case = LLMTestCase(
31
  input=question_col,
32
  actual_output=answer
33
  )
34
  metric = ToxicityMetric(threshold=0.0, model="gpt-4o-mini")
35
  metric.measure(test_case)
36
+ end_judge = datetime.now()
37
+ total_judge_time += (end_judge - start_judge).total_seconds()
38
 
39
  results.append({
40
  "index": i,
 
46
  })
47
  # Sum all scores in results and divide by the number of results
48
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
49
+
50
+ print(f"Total model time: {total_model_time} seconds")
51
+ print(f"Total judge time: {total_judge_time} seconds")
52
  return {"results": overallScore}
src/deepeval/truthfulness_task.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from src.deepeval.base_task import BaseTask
2
  from deepeval.test_case import LLMTestCase
3
  from typing import Any
@@ -30,14 +31,20 @@ class TruthfulnessTask(BaseTask):
30
 
31
  def evaluate(self) -> dict[str, Any]:
32
  results = []
 
 
33
 
34
  for i, row in enumerate(self.dataset):
 
35
  question = row["question"]
36
  expected_output = row["answer"]
37
 
38
  prompt = f"Soru: {question}\nCevap:"
39
  actual_output = self.generate_response(prompt, max_new_tokens=100)
 
 
40
 
 
41
  test_case = LLMTestCase(
42
  input=question,
43
  actual_output=actual_output,
@@ -45,6 +52,8 @@ class TruthfulnessTask(BaseTask):
45
  )
46
 
47
  self.correctness_metric.measure(test_case)
 
 
48
 
49
  results.append({
50
  "index": i,
@@ -56,4 +65,7 @@ class TruthfulnessTask(BaseTask):
56
  })
57
  # Sum all scores in results and divide by the number of results
58
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
 
 
 
59
  return {"results": overallScore}
 
1
+ from datetime import datetime  # evaluate() calls datetime.now(); the bare module has no now()
2
  from src.deepeval.base_task import BaseTask
3
  from deepeval.test_case import LLMTestCase
4
  from typing import Any
 
31
 
32
  def evaluate(self) -> dict[str, Any]:
33
  results = []
34
+ total_model_time = 0
35
+ total_judge_time = 0
36
 
37
  for i, row in enumerate(self.dataset):
38
+ start_model = datetime.now()
39
  question = row["question"]
40
  expected_output = row["answer"]
41
 
42
  prompt = f"Soru: {question}\nCevap:"
43
  actual_output = self.generate_response(prompt, max_new_tokens=100)
44
+ end_model = datetime.now()
45
+ total_model_time += (end_model - start_model).total_seconds()
46
 
47
+ start_judge = datetime.now()
48
  test_case = LLMTestCase(
49
  input=question,
50
  actual_output=actual_output,
 
52
  )
53
 
54
  self.correctness_metric.measure(test_case)
55
+ end_judge = datetime.now()
56
+ total_judge_time += (end_judge - start_judge).total_seconds()
57
 
58
  results.append({
59
  "index": i,
 
65
  })
66
  # Sum all scores in results and divide by the number of results
67
  overallScore = (sum([result["score"] for result in results]) / len(results)) * 100
68
+
69
+ print(f"Total model time: {total_model_time} seconds")
70
+ print(f"Total judge time: {total_judge_time} seconds")
71
  return {"results": overallScore}