from datetime import datetime
from src.deepeval.base_task import BaseTask
from deepeval.metrics import SummarizationMetric
from deepeval.test_case import LLMTestCase
from typing import Any


class SummarizationTask(BaseTask):
    """Generate Turkish summaries with the model and score them with an LLM judge.

    Each dataset row's ``"text"`` field is summarized through
    ``generate_response`` and the summary is scored with deepeval's
    ``SummarizationMetric`` (judge model ``gpt-4o-mini``).
    """

    def __init__(self, model_name: str):
        """Create the task for ``model_name`` on the ``metunlp/summarization_tr`` dataset."""
        super().__init__("metunlp/summarization_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        """Return the dataset produced by the base class's LM-judge loader."""
        return super().load_dataset_lmjudge_from_hf()

    def evaluate(self) -> dict[str, Any]:
        """Summarize every row of ``self.dataset`` and return the mean judge score.

        Returns:
            ``{"results": score}`` where ``score`` is the average of the
            per-row metric scores multiplied by 100. When the dataset yields
            no rows, ``score`` is ``0.0`` instead of raising
            ``ZeroDivisionError``.
        """
        results = []
        total_model_time = 0
        total_judge_time = 0

        for i, row in enumerate(self.dataset):
            # --- Generation phase (timed separately from judging) ---
            start_model = datetime.now()
            # The key that holds the text may differ from dataset to dataset.
            text_data = row["text"]
            prompt = (
                "Aşağıdaki metin için Türkçe bir özet oluşturun.\n"
                f"Metin: {text_data}\n\n"
                "Özet:"
            )
            generated_summary = self.generate_response(prompt, max_new_tokens=200)
            end_model = datetime.now()
            total_model_time += (end_model - start_model).total_seconds()

            # --- Judging phase ---
            start_judge = datetime.now()
            test_case = LLMTestCase(input=text_data, actual_output=generated_summary)
            # A new metric object is built for every row, so the score/reason
            # read below always belong to this row's measurement.
            metric = SummarizationMetric(
                threshold=0.0,
                model="gpt-4o-mini",
            )
            metric.measure(test_case)
            end_judge = datetime.now()
            total_judge_time += (end_judge - start_judge).total_seconds()

            # NOTE: the per-row details are collected here, but only the
            # "score" field is used for the value returned below.
            results.append({
                "index": i,
                "score": metric.score,
                "reason": metric.reason,
                "score_breakdown": metric.score_breakdown,
                "text": text_data,
                "summary": generated_summary,
            })

        # Mean of all per-row scores, scaled to 0-100. Guard the empty case so
        # an empty dataset does not crash with a division by zero.
        if results:
            overall_score = (
                sum(result["score"] for result in results) / len(results)
            ) * 100
        else:
            overall_score = 0.0

        print(f"Total model time: {total_model_time} seconds")
        print(f"Total judge time: {total_judge_time} seconds")

        return {"results": overall_score}