import re
import math
from collections import Counter

import spacy

# Support running both as a package module and as a standalone script.
try:
    from src.clean import clean_license_text
    from src.read_data import read_file
except ImportError:
    from clean import clean_license_text
    from read_data import read_file

# Weight for negated tokens (reserved for future scoring; unused below).
NEGATION_WEIGHT = 0.2

nlp = spacy.load("en_core_web_sm")

modal_verbs = {
    "can",
    "may",
    "must",
    "shall",
    "will",
    # "could",
    # "might",
    "should",
    "would",
}

neg_modal = {
    "cannot",
    "may not",
    "must not",
    "shall not",
    "will not",
    # "could not",
    # "might not",
    "should not",
    "would not",
}

# TODO: Move these structures to another file
license_stopwords = {
    ",",
    "(",
    ")",
    "software",
    "license",
    "work",
    # "copyright",
    "program",
    # "use",
    # "copy",
    "source",
    # "may",
    # "terms",
    "code",
    # "without",
    # "free",
    # "distribute",
    # "rights",
    # "notice",
    # "shall",
    "provided",
    # "permission",
    # "including",
    "version",
    "library",
    # "condition",
    "covered",
    # "must",
    "public",
    # "modify",
    # "distribution",
    # "warranty",
}.union(nlp.Defaults.stop_words) - modal_verbs

negation_words = {"no", "not", "non"}

# TODO: Consider adding these words to the vocab:
# no-charge

verbs = [
    "permit", "copy", "modify", "change", "sell", "reproduce", "transfer",
    "rent", "lease", "assign", "sublet", "distribute", "redistribute",
    "allow", "require", "merge", "publish", "use", "include", "grant",
    "run", "affirm", "propagate", "acknowledge",
]

neg_verbs = [f"not-{verb}" for verb in verbs]

# Keyword buckets: each key doubles as the bucket's weight
# (see properties_scores below).
properties_dict = {
    "0.1": [],
    "0.2": ["everyone"],
    "0.3": ["irrevocable"],
    "0.4": [],
    "0.5": [],
    "0.6": [
        "distribution", "redistribution", "permission", "modification",
        "copyright", "limitation", "free", "charge", "warranty", "term",
        "terms", "condition", "right", "sublicense", "commercial",
        "non-commercial", "exception",
    ],
    "0.7": verbs,
    "0.8": [],
    "0.9": neg_verbs,
    "1.0": [],
    "3.0": modal_verbs,
}

properties_scores = {
    "0.1": 0.1,
    "0.2": 0.2,
    "0.3": 0.3,
    "0.4": 0.4,
    "0.5": 0.5,
    "0.6": 0.6,
    "0.7": 0.7,
    "0.8": 0.8,
    "0.9": 0.9,
    "1.0": 1.0,
    "3.0": 3.0,
}


def lemmatize_tokens(sent):
    """Lemmatize a sentence, dropping stopwords and fusing negations.

    Each token is lowercased and reduced to its lemma. A token preceded by
    a negation word (directly, or across a space/hyphen) is fused with it
    into a single token, e.g. "not distribute" -> "not-distribute".
    """
    lemmas = []
    nlp_sent = [token.lemma_.lower().strip() for token in nlp(sent)]
    for tok_i, token in enumerate(nlp_sent):
        if (token
                and token not in license_stopwords
                and token not in negation_words):
            if tok_i > 0 and nlp_sent[tok_i - 1] in negation_words:
                lemmas.append(f"{nlp_sent[tok_i - 1]}-{token}")
            elif (tok_i > 1
                    and nlp_sent[tok_i - 1] in " -"
                    and nlp_sent[tok_i - 2] in negation_words):
                lemmas.append(f"{nlp_sent[tok_i - 2]}-{token}")
            else:
                lemmas.append(token)
    return lemmas


def custom_textrank_summarizer(license_text, min_sent_len=3,
                               summary_len=0.3, debug=False):
    """Summarize a license by scoring sentences against keyword buckets.

    Each sentence is scored by its weighted keyword hits from
    properties_dict, normalized by sentence length. The top summary_len
    fraction of sentences is returned, along with the definitions
    extracted during cleaning.
    """
    sent_scores = Counter()
    cleaned_license_text, definitions = clean_license_text(license_text)
    cleaned_license_sentences = re.split(r"(\n{2,}|\.)", cleaned_license_text)
    cleaned_license_sentences = [
        text.strip()
        for text in cleaned_license_sentences
        if text.strip() not in ["", ".", "\n", "\n\n"]
    ]
    n_summary_sents = math.ceil(summary_len * len(cleaned_license_sentences))
    if debug:
        print(f"summary length: {n_summary_sents}")
        print(cleaned_license_sentences)
    for sent_i, sent in enumerate(cleaned_license_sentences):
        if len(sent.split()) < min_sent_len:
            continue
        lemmatized_tokens = lemmatize_tokens(sent)
        # Skip sentences with no surviving tokens to avoid dividing by zero.
        if not lemmatized_tokens:
            continue
        if debug:
            print("-" * 50)
            print(f"\nOriginal Sentence = {sent}")
            print(f"\n{sent_i}. Lemmatized tokens = {lemmatized_tokens}")
        score = 0
        word_count = Counter(lemmatized_tokens)
        for prop, prop_words in properties_dict.items():
            prop_score = 0
            imp_words = []
            for prop_word in prop_words:
                if prop_word in word_count:
                    prop_score += properties_scores[prop]
                    imp_words.append(prop_word)
            if debug:
                print(prop, "=", imp_words, "=", prop_score)
            score += prop_score
        # Normalize by token count so long sentences aren't favored.
        sent_scores[sent] = score / len(lemmatized_tokens)
        if debug:
            print(f"Sentence score: {sent_scores[sent]}")
            print()
    if debug:
        print(sent_scores)
    sorted_sent_scores = sent_scores.most_common()
    summary = ".\n".join(
        sent for sent, score in sorted_sent_scores[:n_summary_sents]
    )
    return summary, definitions
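

# Usage sketch, not part of the original module: shows one plausible way to
# run the summarizer end to end. It assumes read_file(path) (imported above)
# returns the file contents as a single string; its real signature may
# differ. The file path is a hypothetical placeholder.
if __name__ == "__main__":
    license_text = read_file("data/licenses/mit_license.txt")
    summary, definitions = custom_textrank_summarizer(
        license_text, min_sent_len=3, summary_len=0.3, debug=False
    )
    print("SUMMARY:")
    print(summary)
    print("\nDEFINITIONS:")
    print(definitions)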