from ts.torch_handler.base_handler import BaseHandler
from nltk.chunk import conlltags2tree
from nltk import pos_tag
from nltk.tree import Tree
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json
import logging
import os
import pickle
import sys

logger = logging.getLogger(__name__)

# Add the directory of this script to sys.path so that sibling modules
# shipped in the model archive can be imported.
current_directory = os.path.dirname(os.path.realpath(__file__))
print(current_directory)
sys.path.insert(0, current_directory)


def pickle_load(path, verbose=False):
    if path is None:
        return None
    if verbose:
        print('Loading {}'.format(path))
    with open(path, "rb") as f:
        obj = pickle.load(f)
    return obj


DEFAULT_MODEL = 'facebook/mgenre-wiki'


def tokenize(text):
    # Add a space before and after specified punctuation marks
    # text = re.sub(r'([,.!?])', r' \1 ', text)
    # Split the text into whitespace-separated tokens
    return text.split()


logger.info('Loading title2wikidataID')
lang_title2wikidataID_path = "lang_title2wikidataID-normalized_with_redirect.pkl"
lang_title2wikidataID = pickle_load(lang_title2wikidataID_path, verbose=True)


def text_to_id(x):
    # mGENRE predictions have the form "<title> >> <language>"; the mapping is
    # keyed by (language, title). Among all matching Wikidata IDs, keep the one
    # with the largest numeric part.
    return max(
        lang_title2wikidataID[tuple(reversed([y.strip() for y in x.split(" >> ")]))],
        key=lambda y: int(y[1:]))


def get_wikidata_qid(wikipedia_titles, scores):
    """Return the Wikidata QID of the first beam candidate that resolves,
    together with its title and score; 'NIL' if none resolves."""
    qid = 'NIL'
    wikipedia_title = wikipedia_titles[0]
    score = scores[0]
    for idx, title in enumerate(wikipedia_titles):
        try:
            qid = text_to_id(title)
            wikipedia_title = wikipedia_titles[idx]
            score = scores[idx]
            return qid, wikipedia_title, score
        except Exception:
            qid = 'NIL'
    return qid, wikipedia_title, score


def get_entities(tokens, preds_list_coarse, preds_list_fine,
                 coarse_confidences, fine_confidences):
    # Convert IOBES tags to IOB2 so conlltags2tree can parse them.
    tags_coarse = [tag.replace('S-', 'B-').replace('E-', 'I-') for tag in preds_list_coarse]
    tags_fine = [tag.replace('S-', 'B-').replace('E-', 'I-') for tag in preds_list_fine]

    pos_tags = [pos for token, pos in pos_tag(tokens)]

    conll_coarse_tags = [(token, pos, tg) for token, pos, tg in zip(tokens, pos_tags, tags_coarse)]
    conll_fine_tags = [(token, pos, tg) for token, pos, tg in zip(tokens, pos_tags, tags_fine)]

    ne_tree_coarse = conlltags2tree(conll_coarse_tags)
    ne_tree_fine = conlltags2tree(conll_fine_tags)

    coarse_entities = get_entities_from_tree(ne_tree_coarse, coarse_confidences)
    fine_entities = get_entities_from_tree(ne_tree_fine, fine_confidences)
    return coarse_entities, fine_entities


def logarithmic_scaling(confidence_score):
    # Add a small value to avoid log(0).
    return np.log(confidence_score + 1e-10)


def classify_confidence(confidence_score):
    # Return the confidence as an integer percentage; plain Python ints are
    # JSON serializable, while numpy float32 values are not.
    return int(confidence_score * 100.0)
    # if confidence_score > 0.95:
    #     return 'high'
    # elif confidence_score > 0.75:
    #     return 'medium'
    # else:
    #     return 'low'
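
# Illustrative sketch (never called by the handler): how a beam of mGENRE
# candidates flows through get_wikidata_qid. The titles, scores and the QID in
# the comment are invented for the example; real values come from generate()
# and from the lang_title2wikidataID pickle.
def _example_qid_lookup():
    candidates = ["Paris >> fr", "Paris (Texas) >> en"]  # hypothetical beam output
    scores = [62.0, 38.0]                                # hypothetical percentages
    # get_wikidata_qid tries candidates in beam order and returns the first one
    # found in the mapping, e.g. ('Q90', 'Paris >> fr', 62.0).
    return get_wikidata_qid(candidates, scores)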
def get_entities_from_tree(ne_tree, token_confidences):
    entities = []
    idx = 0
    char_position = 0  # current character position within the sentence
    for subtree in ne_tree:
        if isinstance(subtree, Tree):
            original_label = subtree.label()
            original_string = " ".join([token for token, pos in subtree.leaves()])
            entity_start_position = char_position
            entity_end_position = entity_start_position + len(original_string)

            # Average the per-token confidences over the entity span.
            confidences = token_confidences[idx:idx + len(subtree)]
            avg_confidence = sum(confidences) / len(confidences)

            print(original_string, '- confidence -', confidences,
                  '- avg -', avg_confidence, classify_confidence(avg_confidence),
                  '- label -', original_label)

            entities.append((original_string,
                             original_label,
                             (idx, idx + len(subtree)),
                             (entity_start_position, entity_end_position),
                             classify_confidence(avg_confidence)))

            idx += len(subtree)
            # Advance by the length of the entity string plus one for the space.
            char_position += len(original_string) + 1
        else:
            # Not part of a named entity ('O' tag): still advance the positions.
            token, pos = subtree
            char_position += len(token) + 1  # plus one for the space
            idx += 1
    return entities


def realign(text_sentence, tokens_coarse_result, tokens_fine_result,
            coarse_confidences, fine_confidences, tokenizer, language,
            nerc_coarse_label_map, nerc_fine_label_map):
    # Map subword-level predictions back to word level: for each word, take the
    # prediction of its first subword (see the sketch after this function).
    preds_list_coarse, preds_list_fine, words_list = [], [], []
    coarse_confidences_list, fine_confidences_list = [], []
    word_ids = tokenizer(text_sentence, is_split_into_words=True).word_ids()
    for idx, word in enumerate(text_sentence):
        try:
            beginning_index = word_ids.index(idx)
            preds_list_coarse.append(nerc_coarse_label_map[tokens_coarse_result[beginning_index]])
            preds_list_fine.append(nerc_fine_label_map[tokens_fine_result[beginning_index]])
            coarse_confidences_list.append(coarse_confidences[beginning_index])
            fine_confidences_list.append(fine_confidences[beginning_index])
        except Exception:
            # The sentence was longer than max_length: pad with 'O'.
            preds_list_coarse.append('O')
            preds_list_fine.append('O')
            coarse_confidences_list.append(1.0)
            fine_confidences_list.append(1.0)
        words_list.append(word)
    return (words_list, preds_list_coarse, preds_list_fine,
            coarse_confidences_list, fine_confidences_list)
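
# Sketch of the alignment idea behind realign() (illustrative only: the words,
# subwords and indices below are made up and depend on the actual tokenizer).
# With is_split_into_words=True, word_ids() maps every subword to the index of
# the word it came from; list.index(i) then finds the *first* subword of word
# i, whose prediction stands in for the whole word:
#
#   words    = ["Washington", "visited"]
#   subwords = [CLS], "Wash", "##ington", "visited", [SEP]
#   word_ids = [None,  0,      0,          1,         None]
#   word_ids.index(0) == 1   # first subword of "Washington"
#   word_ids.index(1) == 3   # first subword of "visited"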
class NewsAgencyHandler(BaseHandler):
    def __init__(self):
        super().__init__()
        self.model = None
        self.tokenizer = None
        self.device = None

    def initialize(self, ctx):
        properties = ctx.system_properties
        self.map_location = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(
            self.map_location + ":" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() else self.map_location)

        # model_dir is the inside of the model archive; extra-files live there.
        model_name = ctx.model_yaml_config["handler"]["model_name"]
        logger.info("Model %s loading tokenizer", model_name)

        logger.info(f'getcwd: {os.getcwd()}')
        logger.info(f'__file__: {__file__}')
        logger.info(f'Model: {model_name}')
        logger.info(f'Device: {self.device}')

        # Alternative loading path, e.g. for a TorchScript checkpoint shipped
        # inside the archive:
        # model_dir = properties.get("model_dir")
        # serialized_file = self.manifest["model"]["serializedFile"]
        # self.tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
        # self.model = torch.jit.load(serialized_file, map_location=self.device)

        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.model.to(self.device)
        self.model.eval()
        logger.info("Transformer model from path %s loaded successfully", model_name)

    def preprocess(self, requests):
        logger.info(f'Preprocessing requests {len(requests)}')
        data = requests[0]
        text_sentences = []
        language = None
        # Each request item carries the text with the mention marked up, e.g.:
        # THE next MEETLNG of the TRITSTEE, will be held at the [START] LONDON
        # HOTEL [END] in POOLE, on ldomaT, the 12th day or MARCH next. at 12
        # oClock at Noon
        for item in data['body']:
            item = json.loads(item)
            text_sentences.append(item['text'])
            language = item['language']
        return text_sentences, language

    def inference(self, inputs):
        text_sentences, language = inputs
        qids = []
        with torch.no_grad():
            for sentence in text_sentences:
                outputs = self.model.generate(
                    **self.tokenizer([sentence], return_tensors="pt").to(self.device),
                    num_beams=5,
                    num_return_sequences=5,
                    return_dict_in_generate=True,
                    output_scores=True)
                token_ids, scores = outputs['sequences'], outputs['sequences_scores']
                wikipedia_titles = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)

                # scores are beam log-likelihoods; exponentiating and normalizing
                # yields relative weights over the beams (not true probabilities),
                # expressed here as percentages.
                probabilities = torch.exp(scores)
                normalized_probabilities = probabilities / torch.sum(probabilities)
                percentages = normalized_probabilities * 100

                qid, wikipedia_title, score = get_wikidata_qid(wikipedia_titles, percentages)
                qids.append({'qid': qid,
                             'wikipedia_title': wikipedia_title,
                             'score': int(score)})
        return qids, text_sentences, language

    def postprocess(self, outputs):
        # Convert the model predictions into the response payload.
        qids, text_sentences, language = outputs
        logger.info(f'Result NEL: {qids}')
        return [[qids]]
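
# Minimal local smoke test (a sketch, not part of the TorchServe lifecycle).
# Assumptions: the 'facebook/mgenre-wiki' checkpoint is downloadable from the
# Hugging Face hub, and the lang_title2wikidataID pickle sits next to this file
# (the module-level load above already requires it). It mirrors what
# inference() does for a single sentence; the sentence itself is invented.
if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL)
    model = AutoModelForSeq2SeqLM.from_pretrained(DEFAULT_MODEL).eval()
    sentence = "THE next MEETLNG will be held at the [START] LONDON HOTEL [END] in POOLE."
    with torch.no_grad():
        outputs = model.generate(
            **tokenizer([sentence], return_tensors="pt"),
            num_beams=5,
            num_return_sequences=5,
            return_dict_in_generate=True,
            output_scores=True)
    titles = tokenizer.batch_decode(outputs['sequences'], skip_special_tokens=True)
    probabilities = torch.exp(outputs['sequences_scores'])
    percentages = probabilities / torch.sum(probabilities) * 100
    print(get_wikidata_qid(titles, percentages))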