from datetime import datetime
from typing import Any

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from src.deepeval.base_task import BaseTask

class ReadingComprehensionTask(BaseTask):
    """Open-ended reading comprehension task scored with an LLM-as-judge GEval metric."""

    def __init__(self, model_name: str):
        super().__init__("metunlp/reading_comp_oe", model_name=model_name)
        self.correctness_metric = GEval(
            name="readingcomprehension",
            criteria="Determine whether the actual output is factually correct based on the expected output.",
            evaluation_steps=[
                "Is the answer correct according to the context?",
                "Does the answer focus on the question using the given context (no unsupported info)?",
                "Does the answer address all parts of the question?",
                "Is the answer internally coherent and plausible?",
                "Is the answer well-written?",
            ],
            model="gpt-4o-mini",
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT,
            ],
        )
    def load_dataset_from_hf(self):
        dataset = super().load_dataset_lmjudge_from_hf()
        return dataset
    def evaluate(self) -> dict[str, Any]:
        results = []
        total_model_time = 0
        total_judge_time = 0

        for i, row in enumerate(self.dataset):
            # Generate the model's answer for the given passage and question.
            start_model = datetime.now()
            text = str(row.get("text", ""))
            question = str(row.get("question_about_the_text", ""))
            expected_answer = str(row.get("answer", ""))

            # Prompt (Turkish): "Answer the following question based on the given paragraph."
            prompt = (
                f"Verilen paragrafa bakarak aşağıdaki soruyu cevaplayın:\n\n"
                f"Paragraf: {text}\n\n"
                f"Soru: {question}"
            )
            answer = self.generate_response(prompt, max_new_tokens=150)
            end_model = datetime.now()
            total_model_time += (end_model - start_model).total_seconds()

            # Score the answer against the reference answer with the GEval judge.
            start_judge = datetime.now()
            test_case = LLMTestCase(
                input=question,
                actual_output=answer,
                expected_output=expected_answer,
            )
            self.correctness_metric.measure(test_case)
            end_judge = datetime.now()
            total_judge_time += (end_judge - start_judge).total_seconds()

            results.append({
                "index": i,
                "score": self.correctness_metric.score,
                "reason": self.correctness_metric.reason,
                "input": question,
                "expected_output": expected_answer,
                "actual_output": answer,
            })

        # Average the per-example judge scores and scale to a 0-100 range.
        overall_score = (sum(result["score"] for result in results) / len(results)) * 100

        print(f"Total model time: {total_model_time} seconds")
        print(f"Total judge time: {total_judge_time} seconds")

        return {"results": overall_score}