import string
from collections import Counter

import torch
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer


def preprocess_text(text, remove_stopwords=True, use_lemmatization=True):
    # Lowercase, keep alphabetic tokens, optionally drop stopwords and lemmatize.
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    if remove_stopwords:
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]
    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens


def get_special_chars():
    import emoji  # Use version emoji==1.6.1, otherwise it won't have UNICODE_EMOJI

    main_special_characters = string.punctuation + string.digits + string.whitespace
    other_special_characters = (
        "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═"
        "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
        "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
        "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
        "」﴾》"
    )
    emoji_chars = list(emoji.UNICODE_EMOJI["en"].keys())
    special_characters_default = set(main_special_characters + other_special_characters)
    special_characters_default.update(emoji_chars)
    return special_characters_default


special_characters_default = get_special_chars()


# -------------------- Features --------------------
def syllable_count(word, d):
    # One syllable count per pronunciation of `word` in a CMU-style pronouncing
    # dictionary `d` (phonemes ending in a digit are stressed vowels, i.e. syllable nuclei).
    return [len(list(y for y in x if y[-1].isdigit())) for x in d.get(word, [])]


def estimated_slightly_difficult_words_ratio(text, d):
    words = word_tokenize(text.lower())
    total_words = len(words)
    # Considering words with 3 or more syllables as difficult
    difficult_count = sum(
        1 for word in words if any(syllables >= 3 for syllables in syllable_count(word, d))
    )
    return difficult_count / total_words if total_words > 0 else 0


# -------------------- Features --------------------
def entity_density(text, nlp):
    # Named entities per token.
    doc = nlp(text)
    return len(doc.ents) / len(doc) if len(doc) else 0


# -------------------- Features --------------------
def determiners_frequency(text, nlp):
    doc = nlp(text)
    determiners = sum(1 for token in doc if token.pos_ == "DET")
    total_words = len(doc)
    return determiners / total_words if total_words else 0


# -------------------- Features --------------------
def punctuation_diversity(text):
    # Fraction of the special-character inventory that occurs in the text.
    punctuation_counts = Counter(
        char for char in text if char in special_characters_default
    )
    diversity_score = (
        len(punctuation_counts) / len(special_characters_default)
        if special_characters_default
        else 0
    )
    return diversity_score


# -------------------- Features --------------------
def type_token_ratio(text, remove_stopwords=True, use_lemmatization=True):
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    unique_words = set(tokens)
    return len(unique_words) / len(tokens) if tokens else 0


# -------------------- Features --------------------
def hapax_legomena_ratio(text, remove_stopwords=True, use_lemmatization=True):
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)
    freq_dist = FreqDist(tokens)
    hapaxes = freq_dist.hapaxes()  # tokens that occur exactly once
    return len(hapaxes) / len(tokens) if tokens else 0


# -------------------- Features --------------------
def mtld(text, threshold=0.72, remove_stopwords=True, use_lemmatization=True):
    # MTLD (Measure of Textual Lexical Diversity): mean factor length over a
    # forward and a backward pass through the tokens.
    tokens = preprocess_text(text, remove_stopwords, use_lemmatization)

    def mtld_calc(direction):
        token_length, factor_count = 0, 0
        types = set()
        for token in tokens if direction == "forward" else reversed(tokens):
            types.add(token)
            token_length += 1
            if len(types) / token_length < threshold:
                # Type-token ratio of the current segment fell below the threshold:
                # close this factor and start a new one.
                factor_count += 1
                types = set()
                token_length = 0
        factor_count += 1  # For the last segment, even if it didn't reach the threshold
        return len(tokens) / factor_count if factor_count != 0 else 0

    return (mtld_calc("forward") + mtld_calc("backward")) / 2


# -------------------- Features --------------------
def calculate_max_depth(sent):
    # Depth of a sentence = maximum number of ancestors of any token in its dependency tree.
    return max(len(list(token.ancestors)) for token in sent)


def calculate_syntactic_tree_depth(text, nlp):
    doc = nlp(text)
    sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
    average_depth = (
        sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
    )
    return average_depth


# -------------------- Features --------------------
def calculate_perplexity(text, model, tokenizer, device, stride=512):
    # Sliding-window perplexity of `text` under a causal language model.
    encodings = tokenizer(text, return_tensors="pt")
    max_length = model.config.n_positions
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)
        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl.item()
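

# -------------------- Usage sketch --------------------
# A minimal sketch of how the features might be wired together. It assumes NLTK's
# CMU pronouncing dictionary for `d`, spaCy's `en_core_web_sm` pipeline for `nlp`,
# and GPT-2 from Hugging Face Transformers for the perplexity model; substitute
# whatever resources your project actually uses. The NLTK data packages
# (punkt, stopwords, wordnet, cmudict) must be downloaded beforehand, e.g. via nltk.download().
if __name__ == "__main__":
    import spacy
    from nltk.corpus import cmudict
    from transformers import GPT2LMHeadModel, GPT2TokenizerFast

    sample = (
        "The quick brown fox jumps over the lazy dog. "
        "Researchers in Paris measured the extraordinary complexity of this sentence."
    )

    d = cmudict.dict()                  # pronunciation dictionary for syllable counts
    nlp = spacy.load("en_core_web_sm")  # NER, POS tags, and dependency parses

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
    model.eval()

    features = {
        "difficult_words_ratio": estimated_slightly_difficult_words_ratio(sample, d),
        "entity_density": entity_density(sample, nlp),
        "determiners_frequency": determiners_frequency(sample, nlp),
        "punctuation_diversity": punctuation_diversity(sample),
        "type_token_ratio": type_token_ratio(sample),
        "hapax_legomena_ratio": hapax_legomena_ratio(sample),
        "mtld": mtld(sample),
        "syntactic_tree_depth": calculate_syntactic_tree_depth(sample, nlp),
        "perplexity": calculate_perplexity(sample, model, tokenizer, device),
    }
    for name, value in features.items():
        print(f"{name}: {value:.4f}")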