from src.deepeval.base_task import BaseTask
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any

class SentimentAnalysisTask(BaseTask):
    """Multiple-choice sentiment evaluation over "metunlp/sentiment_analysis_tr".

    Each dataset row is turned into a Turkish prompt with three lettered
    options (A: positive, B: negative, C: neutral).  The model's reply is
    compared to the letter matching the row's ``"sentiment"`` field.
    """

    # Answer options in the order they are shown to the model; the position
    # determines the option letter (index 0 -> "A", 1 -> "B", 2 -> "C").
    _CHOICES = ("positive", "negative", "neutral")

    # Gold label -> option letter, derived from _CHOICES so the two can never
    # get out of sync.
    _LABEL_TO_LETTER = {
        label: chr(65 + idx) for idx, label in enumerate(_CHOICES)
    }

    def __init__(self, model_name):
        """Initialise the task for ``model_name`` on the sentiment dataset."""
        super().__init__("metunlp/sentiment_analysis_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        """Print a progress message and return the base-class loader's result."""
        print("Loading the dataset")
        return super().load_dataset_from_hf()

    def evaluate(self) -> dict[str, Any]:
        """Run the model over every row of ``self.dataset`` and score it.

        Returns:
            A dict with ``"acc"`` (the result of ``accuracy(n_correct, total)``)
            and ``"acc_stderr"`` (the result of
            ``accuracy_standard_error(acc, total)``).
        """
        total_count = len(self.dataset)

        # The option block is identical for every row, so build it once.
        formatted_choices = "\n".join(
            f"{chr(65 + i)}: {choice}" for i, choice in enumerate(self._CHOICES)
        )

        n_correct = 0
        for row in self.dataset:
            sentence = row["sentence"]
            prompt = f"Verilen metin hangi duyguyu ifade ediyor? {sentence}\n {formatted_choices}"

            # Hand the helper a fresh list per call so it can never observe
            # (or cause) shared state between rows.
            # NOTE(review): assumes the helper returns a str -- confirm in BaseTask.
            answer = self.generate_response_mcqa_multi_token(
                prompt, choices=list(self._CHOICES)
            )

            # Unknown labels map to None, which never equals a cleaned answer,
            # so such rows are counted as incorrect.
            correct_answer_letter = self._LABEL_TO_LETTER.get(row["sentiment"])

            # Drop surrounding whitespace, newlines and spaces, then upper-case
            # so that e.g. " a\n" compares equal to "A".
            model_answer_cleaned = (
                answer.strip().replace('\n', '').replace(' ', '').upper()
            )
            if correct_answer_letter == model_answer_cleaned:
                n_correct += 1

        acc = accuracy(n_correct, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}