from datetime import datetime
from typing import Any

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from src.deepeval.base_task import BaseTask


class ReadingComprehensionTask(BaseTask):
    def __init__(self, model_name: str):
        super().__init__("metunlp/reading_comp_oe", model_name=model_name)

        # LLM-as-a-judge metric that scores the model answer against the reference answer.
        self.correctness_metric = GEval(
            name="readingcomprehension",
            criteria="Determine whether the actual output is factually correct based on the expected output.",
            evaluation_steps=[
                "Is the answer correct according to the context?",
                "Does the answer focus on the question using the given context (no unsupported info)?",
                "Does the answer address all parts of the question?",
                "Is the answer internally coherent and plausible?",
                "Is the answer well-written?",
            ],
            model="gpt-4o-mini",
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT,
            ],
        )

    def load_dataset_from_hf(self):
        dataset = super().load_dataset_lmjudge_from_hf()
        return dataset

    def evaluate(self) -> dict[str, Any]:
        results = []
        total_model_time = 0
        total_judge_time = 0

        for i, row in enumerate(self.dataset):
            # Generate the model's answer for the given passage and question.
            start_model = datetime.now()
            text = str(row.get("text", ""))
            question = str(row.get("question_about_the_text", ""))
            expected_answer = str(row.get("answer", ""))

            prompt = (
                f"Verilen paragrafa bakarak aşağıdaki soruyu cevaplayın:\n\n"
                f"Paragraf: {text}\n\n"
                f"Soru: {question}"
            )

            answer = self.generate_response(prompt, max_new_tokens=150)
            end_model = datetime.now()
            total_model_time += (end_model - start_model).total_seconds()

            # Judge the generated answer against the expected answer.
            start_judge = datetime.now()
            test_case = LLMTestCase(
                input=question,
                actual_output=answer,
                expected_output=expected_answer,
            )
            self.correctness_metric.measure(test_case)
            end_judge = datetime.now()
            total_judge_time += (end_judge - start_judge).total_seconds()

            results.append({
                "index": i,
                "score": self.correctness_metric.score,
                "reason": self.correctness_metric.reason,
                "input": question,
                "expected_output": expected_answer,
                "actual_output": answer,
            })

        # Average all scores and scale to a 0-100 percentage.
        overall_score = (sum(result["score"] for result in results) / len(results)) * 100

        print(f"Total model time: {total_model_time} seconds")
        print(f"Total judge time: {total_judge_time} seconds")

        return {"results": overall_score}
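

# --- Usage sketch (assumption, not part of the original file) ---
# A minimal example of how this task might be run. It assumes that the BaseTask
# constructor loads the model named by `model_name` and that
# load_dataset_from_hf() (or the base class) populates self.dataset before
# evaluate() is called; the model name below is hypothetical.
if __name__ == "__main__":
    task = ReadingComprehensionTask(model_name="your-org/your-model")  # hypothetical model name
    task.load_dataset_from_hf()
    summary = task.evaluate()
    print(summary)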