from typing import Any

from src.deepeval.base_task import BaseTask
from src.deepeval.utils import accuracy, accuracy_standard_error


class SentimentAnalysisTask(BaseTask):
    """Multiple-choice sentiment classification task.

    Each dataset row is turned into a three-option question
    (A: positive, B: negative, C: neutral) and the model's answer is
    compared against the row's ``sentiment`` label.
    """

    def __init__(self, model_name):
        """Initialise the task for ``model_name``.

        The first positional argument handed to ``BaseTask`` is presumably
        the Hugging Face dataset identifier -- confirm against ``BaseTask``.
        """
        super().__init__("metunlp/sentiment_analysis_tr", model_name=model_name)

    def load_dataset_from_hf(self):
        """Print a progress message and return the dataset loaded by the base class."""
        print("Loading the dataset")
        return super().load_dataset_from_hf()

    def evaluate(self) -> dict[str, Any]:
        """Score the model on every row of ``self.dataset``.

        Returns:
            A dict with ``"acc"`` (result of ``accuracy``) and
            ``"acc_stderr"`` (result of ``accuracy_standard_error``).
        """
        # Loop invariants: the option list, its rendered form and the
        # label -> letter table are identical for every row.
        choices = ["positive", "negative", "neutral"]
        option_letters = [chr(65 + i) for i in range(len(choices))]  # "A", "B", "C"
        formatted_choices = "\n".join(
            f"{letter}: {choice}" for letter, choice in zip(option_letters, choices)
        )
        letter_by_label = dict(zip(choices, option_letters))

        total_count = len(self.dataset)
        n_correct = 0

        for row in self.dataset:
            sentence = row["sentence"]
            # Turkish prompt: "Which emotion does the given text express?"
            prompt = f"Verilen metin hangi duyguyu ifade ediyor? {sentence}\n {formatted_choices}"
            # NOTE(review): the helper is assumed to return the chosen option
            # letter as a string -- confirm against BaseTask.
            answer = self.generate_response_mcqa_multi_token(prompt, choices=choices)

            # Labels outside the three known values map to None and can
            # therefore never be counted as correct.
            correct_answer_letter = letter_by_label.get(row["sentiment"])
            # Drop surrounding whitespace, newlines and spaces, then upper-case,
            # so answers such as " a\n" still match "A".
            model_answer_cleaned = answer.strip().replace('\n', '').replace(' ', '').upper()
            if correct_answer_letter == model_answer_cleaned:
                n_correct += 1

        acc = accuracy(n_correct, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}