from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import re
import ast

class POSTask(BaseTask):
    """Part-of-speech tagging evaluation task on the ``metunlp/tr_pos`` dataset.

    For every dataset row the model is prompted with a sentence and asked to
    return one ``word,POS_tag`` pair per line. The reply is parsed into a set
    of ``(word, tag)`` pairs and compared with the gold pairs of the row.
    """

    def __init__(self, model_name):
        """Initialise the task for ``model_name`` on the ``metunlp/tr_pos`` dataset."""
        super().__init__("metunlp/tr_pos", model_name=model_name)

    def load_dataset_from_hf(self):
        """Return the dataset exactly as loaded by the base class."""
        dataset = super().load_dataset_from_hf()
        return dataset

    def generate_response_oeqa_multi_token(self, msg, max_new_tokens: int = 400):
        """Generate a free-form, multi-token answer for a single user message.

        Args:
            msg: Text placed in the single ``user`` turn of the chat template.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The decoded continuation only (prompt tokens are stripped), with
            special tokens removed.

        Note:
            Sampling is enabled (``do_sample=True``, ``temperature=0.7``,
            ``top_p=0.95``), so repeated calls may return different text.
        """
        # Ensure the tokenizer and the model agree on a padding token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        chat = [
            {"role": "user", "content": f"{msg}"},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )

        inputs = self.tokenizer(
            formatted_chat, return_tensors="pt", padding=True, truncation=True
        )
        input_ids = inputs.input_ids.to(self.model.device)
        attention_mask = inputs.attention_mask.to(self.model.device)

        output = self.model.generate(
            input_ids,
            do_sample=True,
            attention_mask=attention_mask,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            temperature=0.7,
            top_p=0.95,
            max_new_tokens=max_new_tokens,
        )

        # The generated sequence starts with the prompt; keep only what
        # follows it.
        generated_ids = output[0]
        generated_tokens = generated_ids[len(input_ids[0]):]
        generated_text = self.tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        )

        return generated_text

    def parse_model_output_pos(self, pred_str):
        """Extract ``(word, POS_tag)`` pairs from the model's CSV-style reply.

        If the reply contains a fenced ``csv`` block, only the content of the
        first such block is parsed (up to the closing fence, or to the end of
        the text when the fence is never closed). Otherwise the whole reply
        is parsed. Lines without a comma are ignored.

        Args:
            pred_str: Raw text returned by the model.

        Returns:
            A set of ``(word, tag)`` tuples, both stripped of surrounding
            whitespace.
        """
        fence = "```csv"
        if fence in pred_str:
            start_idx = pred_str.find(fence) + len(fence)
            end_idx = pred_str.find("```", start_idx)
            if end_idx != -1:
                csv_content = pred_str[start_idx:end_idx].strip()
            else:
                csv_content = pred_str[start_idx:].strip()
        else:
            csv_content = pred_str.strip()

        result = set()
        for line in csv_content.splitlines():
            line = line.strip()
            if not line or ',' not in line:
                continue
            # Split on the LAST comma: POS tags never contain commas, but the
            # word itself may be one (e.g. the line ",,PUNCT").
            word, pos = line.rsplit(',', 1)
            result.add((word.strip(), pos.strip()))
        return result

    def parse_ground_truth_pos(self, token_str):
        """Convert the gold ``tokens`` field into a set of ``(text, pos)`` pairs.

        When the field is a string it is treated as a Python literal. Doubled
        double quotes left over from CSV escaping are collapsed first, e.g.
        ``""Bölgesi'nin""`` becomes ``"Bölgesi'nin"``. A value that is already
        a sequence of token dicts is used as-is.

        Args:
            token_str: Either the string form of a list of dicts with ``text``
                and ``pos`` keys, or such a list itself.

        Returns:
            A set of ``(text, pos)`` tuples; empty when the string cannot be
            parsed.
        """
        if isinstance(token_str, str):
            fixed_token_str = token_str.replace('""', '"')
            try:
                tokens_list = ast.literal_eval(fixed_token_str)
            except (ValueError, SyntaxError, TypeError,
                    MemoryError, RecursionError) as e:
                # Best effort: an unparsable row contributes no gold tags.
                print("Tokens parse hatası:", e)
                tokens_list = []
        else:
            # Already decoded (or missing) – nothing to parse.
            tokens_list = token_str or []
        return {(token["text"], token["pos"]) for token in tokens_list}

    def evaluate_predictions(self, ground_truth_tags, model_predictions):
        """Return the fraction of gold ``(word, tag)`` pairs the model produced.

        Args:
            ground_truth_tags: Set of gold ``(word, tag)`` pairs.
            model_predictions: Set of predicted ``(word, tag)`` pairs.

        Returns:
            A score in ``[0, 1]``; ``0`` when there are no gold pairs.
        """
        correct = ground_truth_tags & model_predictions
        # Avoid dividing by zero when the gold set is empty.
        total = len(ground_truth_tags) if ground_truth_tags else 1
        return len(correct) / total

    def evaluate(self) -> dict[str, Any]:
        """Run the task over ``self.dataset`` and report accuracy.

        Each row contributes a per-sentence score (see
        ``evaluate_predictions``); the scores are summed and passed to the
        project's ``accuracy`` / ``accuracy_standard_error`` helpers.

        Returns:
            A dict with the keys ``acc`` and ``acc_stderr``.
        """
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            category = "base"
            answer = row["tokens"]
            sentence = row["sentence"]

            # Construct the prompt/message
            instruction = """
Bir dilbilimci gibi hareket ederek, herhangi bir düzeltme veya değişiklik yapmadan verilen cümlenin dilbilgisel analizini yapın.
Her kelimenin POS (Part-of-Speech) etiketini belirleyin ve sonuçları belirtilen formatta döndürün.

Kullanılabilecek etiketler ve açıklamaları:
ADP: Edatlar (ör. ile, için),
ADV: Zarflar (ör. en),
AUX: Yardımcı fiiller (ör. olan),
CCONJ: Eş bağlaçlar (ör. ve, ama),
DET: Belirleyiciler (ör. bu, bir),
INTJ: Ünlemler (ör. ah, hey),
NOUN: İsimler (ör. ev, kitap),
NUM: Sayılar (ör. bir, iki),
PRON: Zamirler (ör. o, biz),
PROPN: Özel isimler (ör. Ahmet, İstanbul),
PUNCT: Noktalama işaretleri (ör. ., ?, !),
SCONJ: Alt bağlaçlar (ör. çünkü, eğer),
VERB: Fiiller (ör. koşmak, yazmak).

# Örnek
Cümle: \"Ali topu tut.\"

Çıktı:
Ali, PROPN
topu, NOUN
tut, VERB
., PUNCT

Aşağıda verilen cümledeki varlıkları yukarıdaki tanımlamalara uygun olarak belirleyin. Sadece istenen formatta cevap verin: kelime,POS_etiketi. Ekstra bir yazı veya sembol kullanmayın.
            """
            prompt = f"{instruction}\n\nCümle:\n{sentence}\n"

            # Query the model and keep the raw reply for the final report.
            model_answer = self.generate_response_oeqa_multi_token(prompt)
            responses.append(model_answer)

            ground_truth = self.parse_ground_truth_pos(answer)
            model_output = self.parse_model_output_pos(model_answer)

            # Accumulate the per-sentence score (fraction of gold pairs hit).
            sample_accuracy = self.evaluate_predictions(ground_truth, model_output)
            difficulty_results[category]['correct'] += sample_accuracy
            difficulty_results[category]['total'] += 1
            true += sample_accuracy

        # Print results per category
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        # Guard the ratio so an empty dataset does not raise ZeroDivisionError.
        overall_accuracy = true / total_count if total_count else 0.0
        print("Overall Accuracy:", overall_accuracy)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}