from datetime import datetime
from typing import Any

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase
from deepeval.test_case import LLMTestCaseParams

from src.deepeval.base_task import BaseTask


class TruthfulnessTask(BaseTask):
    """Score a model's answers for truthfulness with an LLM judge.

    Each dataset row supplies a ``question`` and a reference ``answer``.
    The model under test answers the question, and a ``GEval`` metric
    (judge model ``gpt-4o-mini``) compares that answer against the
    reference.
    """

    def __init__(self, model_name: str):
        """Set up the task and its truthfulness metric.

        Args:
            model_name: Forwarded to ``BaseTask.__init__`` as ``model_name``.
        """
        super().__init__("metunlp/sosyoloji_truthfulness", model_name=model_name)

        self.correctness_metric = GEval(
            name="Truthfulness",
            criteria="Determine whether the actual output is factually correct based on the expected output.",
            evaluation_steps=[
                "Check whether the facts in 'actual output' contradict any facts in 'expected output'",
                "Heavily penalize omission of detail",
                "Vague language, or contradicting OPINIONS, are OK"
            ],
            model="gpt-4o-mini",
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT
            ],
        )

    def load_dataset_from_hf(self):
        """Return the dataset produced by the base class LM-judge loader."""
        dataset = super().load_dataset_lmjudge_from_hf()
        return dataset

    def evaluate(self) -> dict[str, Any]:
        """Generate an answer per row, judge it, and return the mean score.

        Prints the accumulated generation time and judging time in seconds.

        Returns:
            ``{"results": score}`` where ``score`` is the mean of the
            per-row metric scores multiplied by 100, or ``0.0`` when the
            dataset yields no rows.
        """
        results = []
        total_model_time = 0
        total_judge_time = 0

        # NOTE(review): self.dataset is presumably populated by BaseTask;
        # confirm against the base class.
        for i, row in enumerate(self.dataset):
            start_model = datetime.now()
            question = row["question"]
            expected_output = row["answer"]

            prompt = f"Soru: {question}\nCevap:"
            actual_output = self.generate_response(prompt, max_new_tokens=100)
            end_model = datetime.now()
            total_model_time += (end_model - start_model).total_seconds()

            start_judge = datetime.now()
            test_case = LLMTestCase(
                input=question,
                actual_output=actual_output,
                expected_output=expected_output
            )

            self.correctness_metric.measure(test_case)
            end_judge = datetime.now()
            total_judge_time += (end_judge - start_judge).total_seconds()

            # The metric object is reused across rows, so copy its score and
            # reason out before the next measure() call overwrites them.
            results.append({
                "index": i,
                "score": self.correctness_metric.score,
                "reason": self.correctness_metric.reason,
                "input": question,
                "expected_output": expected_output,
                "actual_output": actual_output
            })

        # Mean of all scores, as a percentage. An empty dataset would divide
        # by zero, so report 0.0 instead of raising.
        if results:
            overall_score = (
                sum(result["score"] for result in results) / len(results)
            ) * 100
        else:
            overall_score = 0.0

        print(f"Total model time: {total_model_time} seconds")
        print(f"Total judge time: {total_judge_time} seconds")
        return {"results": overall_score}