import re
import math
from collections import Counter

import spacy

# Support running both as a package module and as a standalone script.
try:
    from src.clean import clean_license_text
    from src.read_data import read_file
except ImportError:
    from clean import clean_license_text
    from read_data import read_file

# Weight for negated tokens (reserved for future scoring; unused below).
NEGATION_WEIGHT = 0.2

nlp = spacy.load("en_core_web_sm")

modal_verbs = {
    "can",
    "may",
    "must",
    "shall",
    "will",
    # "could",
    # "might",
    "should",
    "would",
}

neg_modal = {
    "cannot",
    "may not",
    "must not",
    "shall not",
    "will not",
    # "could not",
    # "might not",
    "should not",
    "would not",
}

# TODO: Move these structures to another file
license_stopwords = {
    ",",
    "(",
    ")",
    "software",
    "license",
    "work",
    # "copyright",
    "program",
    # "use",
    # "copy",
    "source",
    # "may",
    # "terms",
    "code",
    # "without",
    # "free",
    # "distribute",
    # "rights",
    # "notice",
    # "shall",
    "provided",
    # "permission",
    # "including",
    "version",
    "library",
    # "condition",
    "covered",
    # "must",
    "public",
    # "modify",
    # "distribution",
    # "warranty",
}.union(nlp.Defaults.stop_words) - modal_verbs

negation_words = {"no", "not", "non"}

# TODO: Consider adding these words to the vocab:
# no-charge

verbs = [
    "permit", "copy", "modify", "change", "sell", "reproduce", "transfer",
    "rent", "lease", "assign", "sublet", "distribute", "redistribute",
    "allow", "require", "merge", "publish", "use", "include", "grant",
    "run", "affirm", "propagate", "acknowledge",
]

neg_verbs = [f"not-{verb}" for verb in verbs]

# Keyword buckets: each key doubles as the bucket's weight
# (see properties_scores below).
properties_dict = {
    "0.1": [],
    "0.2": ["everyone"],
    "0.3": ["irrevocable"],
    "0.4": [],
    "0.5": [],
    "0.6": [
        "distribution", "redistribution", "permission", "modification",
        "copyright", "limitation", "free", "charge", "warranty", "term",
        "terms", "condition", "right", "sublicense", "commercial",
        "non-commercial", "exception",
    ],
    "0.7": verbs,
    "0.8": [],
    "0.9": neg_verbs,
    "1.0": [],
    "3.0": modal_verbs,
}

properties_scores = {
    "0.1": 0.1,
    "0.2": 0.2,
    "0.3": 0.3,
    "0.4": 0.4,
    "0.5": 0.5,
    "0.6": 0.6,
    "0.7": 0.7,
    "0.8": 0.8,
    "0.9": 0.9,
    "1.0": 1.0,
    "3.0": 3.0,
}


def lemmatize_tokens(sent):
    """Lemmatize a sentence, dropping stopwords and fusing negations.

    Each token is lowercased and reduced to its lemma. A token preceded by
    a negation word (directly, or across a space/hyphen) is fused with it
    into a single token, e.g. "not distribute" -> "not-distribute".
    """
    lemmas = []
    nlp_sent = [token.lemma_.lower().strip() for token in nlp(sent)]
    for tok_i, token in enumerate(nlp_sent):
        if (token
                and token not in license_stopwords
                and token not in negation_words):
            if tok_i > 0 and nlp_sent[tok_i - 1] in negation_words:
                lemmas.append(f"{nlp_sent[tok_i - 1]}-{token}")
            elif (tok_i > 1
                    and nlp_sent[tok_i - 1] in " -"
                    and nlp_sent[tok_i - 2] in negation_words):
                lemmas.append(f"{nlp_sent[tok_i - 2]}-{token}")
            else:
                lemmas.append(token)
    return lemmas


def custom_textrank_summarizer(license_text, min_sent_len=3,
                               summary_len=0.3, debug=False):
    """Summarize a license by scoring sentences against keyword buckets.

    Each sentence is scored by its weighted keyword hits from
    properties_dict, normalized by sentence length. The top summary_len
    fraction of sentences is returned, along with the definitions
    extracted during cleaning.
    """
    sent_scores = Counter()
    cleaned_license_text, definitions = clean_license_text(license_text)
    cleaned_license_sentences = re.split(r"(\n{2,}|\.)", cleaned_license_text)
    cleaned_license_sentences = [
        text.strip()
        for text in cleaned_license_sentences
        if text.strip() not in ["", ".", "\n", "\n\n"]
    ]
    n_summary_sents = math.ceil(summary_len * len(cleaned_license_sentences))
    if debug:
        print(f"summary length: {n_summary_sents}")
        print(cleaned_license_sentences)
    for sent_i, sent in enumerate(cleaned_license_sentences):
        if len(sent.split()) < min_sent_len:
            continue
        lemmatized_tokens = lemmatize_tokens(sent)
        # Skip sentences with no surviving tokens to avoid dividing by zero.
        if not lemmatized_tokens:
            continue
        if debug:
            print("-" * 50)
            print(f"\nOriginal Sentence = {sent}")
            print(f"\n{sent_i}. Lemmatized tokens = {lemmatized_tokens}")
        score = 0
        word_count = Counter(lemmatized_tokens)
        for prop, prop_words in properties_dict.items():
            prop_score = 0
            imp_words = []
            for prop_word in prop_words:
                if prop_word in word_count:
                    prop_score += properties_scores[prop]
                    imp_words.append(prop_word)
            if debug:
                print(prop, "=", imp_words, "=", prop_score)
            score += prop_score
        # Normalize by token count so long sentences aren't favored.
        sent_scores[sent] = score / len(lemmatized_tokens)
        if debug:
            print(f"Sentence score: {sent_scores[sent]}")
            print()
    if debug:
        print(sent_scores)
    sorted_sent_scores = sent_scores.most_common()
    summary = ".\n".join(
        sent for sent, score in sorted_sent_scores[:n_summary_sents]
    )
    return summary, definitions
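

# Usage sketch, not part of the original module: shows one plausible way to
# run the summarizer end to end. It assumes read_file(path) (imported above)
# returns the file contents as a single string; its real signature may
# differ. The file path is a hypothetical placeholder.
if __name__ == "__main__":
    license_text = read_file("data/licenses/mit_license.txt")
    summary, definitions = custom_textrank_summarizer(
        license_text, min_sent_len=3, summary_len=0.3, debug=False
    )
    print("SUMMARY:")
    print(summary)
    print("\nDEFINITIONS:")
    print(definitions)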