# model-eval-be / src/deepeval/truthfulness_task.py
# Author: Ahmet Kaan Sever
# "Fixed lm judge abstraction" (commit f74f2a9)
from datetime import datetime
from src.deepeval.base_task import BaseTask
from deepeval.test_case import LLMTestCase
from typing import Any
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
class TruthfulnessTask(BaseTask):
    """Truthfulness evaluation task over the metunlp/sosyoloji_truthfulness dataset.

    Generates a Turkish-prompted answer per question with the model under test,
    then scores it against the reference answer using a GEval LLM judge
    (gpt-4o-mini). `evaluate` returns the mean judge score scaled to 0-100.
    """

    def __init__(self, model_name: str):
        """Initialize the task for `model_name` and configure the GEval judge."""
        super().__init__("metunlp/sosyoloji_truthfulness", model_name=model_name)
        # LLM-as-judge metric: compares actual vs. expected output for factuality.
        self.correctness_metric = GEval(
            name="Truthfulness",
            criteria="Determine whether the actual output is factually correct based on the expected output.",
            evaluation_steps=[
                "Check whether the facts in 'actual output' contradict any facts in 'expected output'",
                "Heavily penalize omission of detail",
                "Vague language, or contradicting OPINIONS, are OK"
            ],
            model="gpt-4o-mini",
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT
            ],
        )

    def load_dataset_from_hf(self):
        """Load the LM-judge variant of the dataset from Hugging Face."""
        return super().load_dataset_lmjudge_from_hf()

    def evaluate(self) -> dict[str, Any]:
        """Run generation + judging over the whole dataset.

        Returns:
            dict with key "results" holding the mean judge score as a
            percentage (0-100). Returns 0.0 for an empty dataset instead
            of raising ZeroDivisionError.
        """
        results: list[dict[str, Any]] = []
        total_model_time = 0.0
        total_judge_time = 0.0

        for i, row in enumerate(self.dataset):
            question = row["question"]
            expected_output = row["answer"]
            prompt = f"Soru: {question}\nCevap:"

            # Time only the model generation, not prompt construction.
            start_model = datetime.now()
            actual_output = self.generate_response(prompt, max_new_tokens=100)
            total_model_time += (datetime.now() - start_model).total_seconds()

            start_judge = datetime.now()
            test_case = LLMTestCase(
                input=question,
                actual_output=actual_output,
                expected_output=expected_output
            )
            self.correctness_metric.measure(test_case)
            total_judge_time += (datetime.now() - start_judge).total_seconds()

            results.append({
                "index": i,
                "score": self.correctness_metric.score,
                "reason": self.correctness_metric.reason,
                "input": question,
                "expected_output": expected_output,
                "actual_output": actual_output
            })

        # Mean score scaled to a percentage; guard against an empty dataset.
        # NOTE(review): GEval may leave `score` as None on a failed judge call —
        # coalesce to 0.0 so the sum cannot raise TypeError. Confirm desired
        # handling (skip vs. zero) with the team.
        if results:
            overall_score = (
                sum(r["score"] or 0.0 for r in results) / len(results)
            ) * 100
        else:
            overall_score = 0.0

        print(f"Total model time: {total_model_time} seconds")
        print(f"Total judge time: {total_judge_time} seconds")
        return {"results": overall_score}