# model-eval-be/src/deepeval/reading_comprehension_task.py
from datetime import datetime
from typing import Any

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from src.deepeval.base_task import BaseTask
class ReadingComprehensionTask(BaseTask):
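    """Open-ended reading comprehension task on metunlp/reading_comp_oe, scored by an LLM judge via GEval."""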
def __init__(self, model_name: str):
super().__init__("metunlp/reading_comp_oe", model_name=model_name)
self.correctness_metric = GEval(
name="readingcomprehension",
criteria="Determine whether the actual output is factually correct based on the expected output.",
evaluation_steps=[
"Is the answer correct according to the context?",
"Does the answer focus on the question using the given context (no unsupported info)?",
"Does the answer address all parts of the question?",
"Is the answer internally coherent and plausible?",
"Is the answer well-written?"
],
model="gpt-4o-mini",
evaluation_params=[
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT
],
)

    def load_dataset_from_hf(self):
        return super().load_dataset_lmjudge_from_hf()

def evaluate(self) -> dict[str, Any]:
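        """Generate an answer for each row, score it with the LLM judge, and return the average score scaled to 0-100."""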
results = []
total_model_time = 0
total_judge_time = 0
for i, row in enumerate(self.dataset):
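            # Time answer generation separately from judge scoring.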
start_model = datetime.now()
text = str(row.get("text", ""))
question = str(row.get("question_about_the_text", ""))
expected_answer = str(row.get("answer", ""))
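            # Turkish prompt: "Answer the following question based on the given paragraph."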
prompt = (
f"Verilen paragrafa bakarak aşağıdaki soruyu cevaplayın:\n\n"
f"Paragraf: {text}\n\n"
f"Soru: {question}"
)
answer = self.generate_response(prompt, max_new_tokens=150)
end_model = datetime.now()
total_model_time += (end_model - start_model).total_seconds()
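            # Judge the generated answer against the reference answer with the GEval correctness metric.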
start_judge = datetime.now()
test_case = LLMTestCase(
input=question,
actual_output=answer,
expected_output=expected_answer
)
self.correctness_metric.measure(test_case)
end_judge = datetime.now()
total_judge_time += (end_judge - start_judge).total_seconds()
results.append({
"index": i,
"score": self.correctness_metric.score,
"reason": self.correctness_metric.reason,
"input": question,
"expected_output": expected_answer,
"actual_output": answer
})

        # Average the scores across all results and scale to a 0-100 range.
        overall_score = (sum(result["score"] for result in results) / len(results)) * 100 if results else 0.0

        print(f"Total model time: {total_model_time} seconds")
        print(f"Total judge time: {total_judge_time} seconds")

        return {"results": overall_score}