from src.deepeval.base_task import BaseTask
from collections import defaultdict
from src.deepeval.utils import accuracy, accuracy_standard_error
from typing import Any
import re
import ast
import math

class NERTask(BaseTask):
    def __init__(self, model_name):
        """
        Set up the NER task for ``model_name``.

        Forwards the fixed identifier ``"metunlp/tr_ner"`` and the model name
        to the ``BaseTask`` constructor.
        NOTE(review): the identifier is presumably the Hugging Face dataset
        repository for this task -- confirm against ``BaseTask``.

        :param model_name: name of the model to evaluate, passed through
            unchanged to ``BaseTask``
        """
        dataset_name = "metunlp/tr_ner"
        super().__init__(dataset_name, model_name=model_name)

    def load_dataset_from_hf(self):
        """
        Load the task dataset through the parent implementation.

        :return: whatever ``BaseTask.load_dataset_from_hf`` returns, unchanged
        """
        return super().load_dataset_from_hf()

    def generate_response_oeqa_multi_token(self, msg, max_new_tokens: int = 128):
        """
        Handles multiple-choice questions where answers might have multiple tokens.
        """
        # Ensure tokenizer has proper special tokens set
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        if self.model.config.pad_token_id is None:
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        chat = [
            {"role": "user", "content": f"{msg}"},
        ]
        formatted_chat = self.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        # print(formatted_chat)

        inputs = self.tokenizer(formatted_chat, return_tensors="pt", padding=True, truncation=True)
        input_ids = inputs.input_ids.to(self.model.device)
        attention_mask = inputs.attention_mask.to(self.model.device)

        # Generate response with proper token limits
        output = self.model.generate(
            input_ids,
            do_sample=True,
            attention_mask=attention_mask,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            temperature=0.7,
            top_p=0.95,
            max_new_tokens=max_new_tokens,
        )

        generated_ids = output[0]  # The generated sequence including the prompt
        generated_tokens = generated_ids[len(input_ids[0]):]  # Exclude the input_ids part
        generated_text = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

        return generated_text

    def parse_model_output(self,pred_str):
        """
        Modelin çıktısını işleyerek, içerisindeki CSV bloktan NER etiketlerini ayıklar.
        Beklenen CSV formatı (her satır):
            kelime_grubu,NER_etiketi
        """
        # Eğer çıktı markdown kod bloğu içerisinde verilmişse, CSV kısmını ayıkla
        if "```csv" in pred_str:
            start_idx = pred_str.find("```csv")
            end_idx = pred_str.find("```", start_idx + 6)
            if end_idx != -1:
                csv_content = pred_str[start_idx + 6:end_idx].strip()
            else:
                csv_content = pred_str[start_idx + 6:].strip()
        else:
            # Eğer kod bloğu yoksa, tüm metni CSV olarak kabul et
            csv_content = pred_str.strip()

        result = set()
        # CSV içeriğindeki her satırı işle
        for line in csv_content.splitlines():
            line = line.strip()
            if not line:
                continue  # Boş satırları atla
            # Her satırın "text,NER_label" formatında olduğunu varsayalım
            if ',' in line:
                text, label = line.split(',', 1)
                result.add((text.strip(), label.strip()))
            else:
                # Eğer virgül yoksa, satırı yok sayabilir veya hata fırlatabilirsiniz
                continue
        return result

    def parse_ground_truth(self,tag_str):
        """Parses the 'tags' column from the CSV dataset."""
        return {(tag["text"], tag["label"]) for tag in ast.literal_eval(tag_str)}

    def evaluate_predictions(self,ground_truth_tags, model_predictions):
        """Computes a score between 0 and 1 based on correct labels."""
        correct = ground_truth_tags & model_predictions
        incorrect = model_predictions - ground_truth_tags
        missed = ground_truth_tags - model_predictions

        total = len(ground_truth_tags) if ground_truth_tags else 1
        score = len(correct) / total

        #print(f"Correct: {correct}, Incorrect: {incorrect}, Missed: {missed}i Total: {total}")
        return score
    
    def accuracy_standard_error(accuracy: float, n: int) -> float:
        """
        :param accuracy: accuracy of the model
        :param n: number of samples
        :return: standard deviation of accuracy of the model
        """
        return math.sqrt((accuracy * (1 - accuracy)) / n)

    def accuracy(n_correct: int, n_total: int) -> float:
        """
        :param n_correct: number of correct predictions
        :param n_total: number of total predictions
        :return: accuracy of the model
        """
        return n_correct / n_total

    def evaluate(self) -> dict[str, Any]:
        responses = []
        difficulty_results = defaultdict(lambda: {'correct': 0, 'total': 0})
        total_count = 0
        true = 0

        for row in self.dataset:
            total_count += 1

            # Get values from row
            category = "base"
            answer = row["tags"]
            sentence = row["sentence"]

            # Prints for debugging
            #print(f"Answer: {answer}")
            #print("Type of answer:", type(answer))

            # Construct the prompt/message
            instruction = """Aşağıdaki Named Entity Recognition (NER) için etiketlenmesi gereken cümleler vardır.
Cümlelerdeki varlıkları belirleyin ve şu kategorilere ayırın: CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PER, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, TITLE, WORK_OF_ART.
Varlıklar, anlamlı bilgiler içeren terimlerdir ve aşağıdaki şekilde tanımlanır:
CARDINAL: Nicelik veya sıralama belirtmeyen sayısal ifadeler.
DATE: Belirli bir tarih veya zaman ifadeleri.
EVENT: Adlandırılmış olaylar veya durumlar.
FAC: Binalar veya önemli yerler gibi tesisler.
GPE: Ülke, şehir veya eyalet gibi coğrafi-politik varlıklar.
LANGUAGE: Adlandırılmış diller.
LAW: Yasal belgeler, düzenlemeler veya kanunlar.
LOC: Coğrafi veya fiziksel konumlar (GPE dışındaki).
MONEY: Parasal değerler.
NORP: Milletler, dini veya siyasi gruplar.
ORDINAL: Sıralama veya dereceler.
ORG: Organizasyonlar veya kurumlar.
PER: Kişisel unvanlar veya sıfatlar.
PERSON: Bireylerin isimleri.
PRODUCT: Üretilen nesneler veya araçlar.
QUANTITY: Ölçülebilir miktarlar ve birimler.
TIME: Günün belirli saatleri.
TITLE: Kişi unvanları.
WORK_OF_ART: Sanat eserleri, kitaplar, müzik vb. Adlar, tarih ifadeleri, konumlar gibi belirgin bilgiler varlıktır.

Fiiller, sıfatlar, zarflar, soyut kavramlar gibi ifadeler varlık değildir. Çıktıyı aşağıdaki CSV formatında döndürün.

# Örnek
Cümle: \"Üç yıl aradan sonra gerçekleştirilen ve Karadeniz, Ege ve Akdeniz’de düzenlenecek olan tatbikata ilişkin Yunanistan'ın Kathimerini gazetesi 'Türk-Yunan: Çetin donanma dengesinin gücü' başlığını kullandı.\"

Çıktı:
Üç yıl,DATE
Karadeniz,LOC
Ege,LOC
Akdeniz,LOC
Yunanistan,GPE
Kathimerini,ORG
Türk,NORP

Her satır sadece şu formatta olmalıdır: kelime_grubu,NER_etiketi. Ekstra bir yazı veya sembol kullanmayın.
              """
            prompt = f"{instruction}\n\nCümle:\n{sentence}\n"
            message = prompt

            # Get/format answer of the model
            model_answer = self.generate_response_oeqa_multi_token(message)
            responses.append(model_answer)
            model_answer_cleaned = model_answer

            # Print answers
            #print(f"Correct Answer: {answer}")
            #print(f"Model Answer: {model_answer}")
            #print(f"Model Answer Cleaned: {model_answer_cleaned}")
            #print(f"Result: {answer == model_answer_cleaned}")
            
            # print("\n\n---\n")
            # print("\nSentence:", sentence)
            ground_truth = self.parse_ground_truth(answer)
            #print(f"Ground Truth: {ground_truth}")
            model_output = self.parse_model_output(model_answer_cleaned)
            #print(f"Model Output: {model_output}")
            # Check if correct based on metric
            sample_accuracy = self.evaluate_predictions(ground_truth, model_output)
            difficulty_results[category]['correct'] += sample_accuracy
            difficulty_results[category]['total'] += 1
            true += sample_accuracy

        # Print results categorized by difficulty
        for category, stats in difficulty_results.items():
            calculatedAccuracy = stats['correct'] / stats['total'] if stats['total'] > 0 else 0
            print(f"{category.capitalize()} Accuracy: {calculatedAccuracy:.2%} ({stats['correct']}/{stats['total']})")

        print("Results:", responses)
        print("Overall Accuracy:", true / total_count)
        acc = accuracy(true, total_count)
        acc_stderr = accuracy_standard_error(acc, total_count)
        return {"acc": acc, "acc_stderr": acc_stderr}