# model-eval-be/src/deepeval/reading_comprehension_task.py
from datetime import datetime
from typing import Any

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from src.deepeval.base_task import BaseTask
class ReadingComprehensionTask(BaseTask):
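    """Open-ended reading comprehension task on metunlp/reading_comp_oe, scored by an LLM judge via GEval."""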
def __init__(self, model_name: str):
super().__init__("metunlp/reading_comp_oe", model_name=model_name)
self.correctness_metric = GEval(
name="readingcomprehension",
criteria="Determine whether the actual output is factually correct based on the expected output.",
evaluation_steps=[
"Is the answer correct according to the context?",
"Does the answer focus on the question using the given context (no unsupported info)?",
"Does the answer address all parts of the question?",
"Is the answer internally coherent and plausible?",
"Is the answer well-written?"
],
model="gpt-4o-mini",
evaluation_params=[
LLMTestCaseParams.INPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
LLMTestCaseParams.EXPECTED_OUTPUT
],
)

    def load_dataset_from_hf(self):
        return super().load_dataset_lmjudge_from_hf()

def evaluate(self) -> dict[str, Any]:
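        """Generate an answer for each row, score it with the LLM judge, and return the average score scaled to 0-100."""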
results = []
total_model_time = 0
total_judge_time = 0
for i, row in enumerate(self.dataset):
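            # Time answer generation separately from judge scoring.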
start_model = datetime.now()
text = str(row.get("text", ""))
question = str(row.get("question_about_the_text", ""))
expected_answer = str(row.get("answer", ""))
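            # Turkish prompt: "Answer the following question based on the given paragraph."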
prompt = (
f"Verilen paragrafa bakarak aşağıdaki soruyu cevaplayın:\n\n"
f"Paragraf: {text}\n\n"
f"Soru: {question}"
)
answer = self.generate_response(prompt, max_new_tokens=150)
end_model = datetime.now()
total_model_time += (end_model - start_model).total_seconds()
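            # Judge the generated answer against the reference answer with the GEval correctness metric.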
start_judge = datetime.now()
test_case = LLMTestCase(
input=question,
actual_output=answer,
expected_output=expected_answer
)
self.correctness_metric.measure(test_case)
end_judge = datetime.now()
total_judge_time += (end_judge - start_judge).total_seconds()
results.append({
"index": i,
"score": self.correctness_metric.score,
"reason": self.correctness_metric.reason,
"input": question,
"expected_output": expected_answer,
"actual_output": answer
})

        # Average the scores across all results and scale to a 0-100 range.
        overall_score = (sum(result["score"] for result in results) / len(results)) * 100 if results else 0.0

        print(f"Total model time: {total_model_time} seconds")
        print(f"Total judge time: {total_judge_time} seconds")

        return {"results": overall_score}