from datetime import datetime
from typing import Any

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from src.deepeval.base_task import BaseTask


class ReadingComprehensionTask(BaseTask):
    def __init__(self, model_name: str):
        super().__init__("metunlp/reading_comp_oe", model_name=model_name)

        # LLM-as-a-judge metric that scores the model answer against the reference answer.
        self.correctness_metric = GEval(
            name="readingcomprehension",
            criteria="Determine whether the actual output is factually correct based on the expected output.",
            evaluation_steps=[
                "Is the answer correct according to the context?",
                "Does the answer focus on the question using the given context (no unsupported info)?",
                "Does the answer address all parts of the question?",
                "Is the answer internally coherent and plausible?",
                "Is the answer well-written?",
            ],
            model="gpt-4o-mini",
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT,
            ],
        )

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_lmjudge_from_hf()
        return dataset

    def evaluate(self) -> dict[str, Any]:
        results = []
        total_model_time = 0
        total_judge_time = 0

        for i, row in enumerate(self.dataset):
            # Generate the model's answer for the given passage and question.
            start_model = datetime.now()
            text = str(row.get("text", ""))
            question = str(row.get("question_about_the_text", ""))
            expected_answer = str(row.get("answer", ""))

            prompt = (
                f"Verilen paragrafa bakarak aşağıdaki soruyu cevaplayın:\n\n"
                f"Paragraf: {text}\n\n"
                f"Soru: {question}"
            )

            answer = self.generate_response(prompt, max_new_tokens=150)
            end_model = datetime.now()
            total_model_time += (end_model - start_model).total_seconds()

            # Judge the generated answer against the expected answer.
            start_judge = datetime.now()
            test_case = LLMTestCase(
                input=question,
                actual_output=answer,
                expected_output=expected_answer,
            )
            self.correctness_metric.measure(test_case)
            end_judge = datetime.now()
            total_judge_time += (end_judge - start_judge).total_seconds()

            results.append({
                "index": i,
                "score": self.correctness_metric.score,
                "reason": self.correctness_metric.reason,
                "input": question,
                "expected_output": expected_answer,
                "actual_output": answer,
            })

        # Average all scores and scale to a 0-100 percentage.
        overall_score = (sum(result["score"] for result in results) / len(results)) * 100

        print(f"Total model time: {total_model_time} seconds")
        print(f"Total judge time: {total_judge_time} seconds")

        return {"results": overall_score}
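

# --- Usage sketch (assumption, not part of the original file) ---
# A minimal example of how this task might be run. It assumes that the BaseTask
# constructor loads the model named by `model_name` and that
# load_dataset_from_hf() (or the base class) populates self.dataset before
# evaluate() is called; the model name below is hypothetical.
if __name__ == "__main__":
    task = ReadingComprehensionTask(model_name="your-org/your-model")  # hypothetical model name
    task.load_dataset_from_hf()
    summary = task.evaluate()
    print(summary)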