import math
import re
from collections import Counter

import spacy

try:
    from src.clean import clean_license_text
    from src.read_data import read_file
except ImportError:
    from clean import clean_license_text
    from read_data import read_file

NEGATION_WEIGHT = 0.2

nlp = spacy.load("en_core_web_sm")

# Modal verbs carry permission/obligation signals in license text, so
# they are kept even though spaCy treats some of them as stopwords
# (see the `- modal_verbs` subtraction below).
modal_verbs = {
    "can",
    "may",
    "must",
    "shall",
    "will",
    # "could",
    # "might",
    "should",
    "would"
}

neg_modal = {
    "cannot",
    "may not",
    "must not",
    "shall not",
    "will not",
    # "could not",
    # "might not",
    "should not",
    "would not"
}

# TODO Move these structures to another file
license_stopwords = {
    ",",
    "(",
    ")",
    "software",
    "license",
    "work",
    # "copyright",
    "program",
    # "use",
    # "copy",
    "source",
    # "may",
    # "terms",
    "code",
    # "without",
    # "free",
    # "distribute",
    # "rights",
    # "notice",
    # "shall",
    "provided",
    # "permission",
    # "including",
    "version",
    "library",
    # "condition",
    "covered",
    # "must",
    "public",
    # "modify",
    # "distribution",
    # "warranty",
}.union(nlp.Defaults.stop_words) - modal_verbs

negation_words = {
    "no",
    "not",
    "non"
}

# TODO: Consider adding these words to the vocab:
# no-charge

verbs = [
    "permit", "copy", "modify", "change", "sell", "reproduce",
    "transfer", "rent", "lease", "assign", "sublet", "distribute",
    "redistribute", "allow", "require", "merge", "publish", "use",
    "include", "grant", "run", "affirm", "propagate", "acknowledge"
]

# Negated forms match the "not-<verb>" tokens that lemmatize_tokens
# produces when a negation word precedes a verb.
neg_verbs = [f"not-{verb}" for verb in verbs]

# Vocabulary tiers: each key doubles as the per-match weight applied to
# a sentence (see properties_scores below).
properties_dict = {
    "0.1": [],
    "0.2": ["everyone"],
    "0.3": ["irrevocable"],
    "0.4": [],
    "0.5": [],
    "0.6": [
        "distribution", "redistribution",
        "permission", "modification",
        "copyright",
        "limitation",
        "free", "charge",
        "warranty",
        "term", "terms", "condition",
        "right",
        "sublicense",
        "commercial", "non-commercial",
        "exception"
    ],
    "0.7": verbs,
    "0.8": [],
    "0.9": neg_verbs,
    "1.0": [],
    "3.0": modal_verbs
}

# Each tier's score is simply its key parsed as a float.
properties_scores = {prop: float(prop) for prop in properties_dict}
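
# Illustrative scoring example (assumption: this comment is not part of
# the original file). A sentence whose lemmas include "irrevocable"
# (tier 0.3) and "warranty" (tier 0.6) accrues 0.3 + 0.6 = 0.9 before
# normalization; custom_textrank_summarizer then divides by the lemma
# count, so short sentences dense in high-tier vocabulary outrank long
# sentences with scattered matches.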

def lemmatize_tokens(sent):
    """Lemmatize a sentence, dropping stopwords and fusing negations.

    Negated terms such as "not modify" or "non-commercial" are emitted
    as single tokens ("not-modify", "non-commercial") so they can match
    the neg_verbs vocabulary.
    """
    lemmas = list()
    nlp_sent = [token.lemma_.lower().strip() for token in nlp(sent)]
    for tok_i, token in enumerate(nlp_sent):
        if (token
                and token not in license_stopwords
                and token not in negation_words):
            if tok_i > 0 and nlp_sent[tok_i - 1] in negation_words:
                # Direct negation: "not modify" -> "not-modify"
                lemmas.append(f"{nlp_sent[tok_i - 1]}-{token}")
            elif (tok_i > 1
                    and nlp_sent[tok_i - 1] in {"-", ""}
                    and nlp_sent[tok_i - 2] in negation_words):
                # Negation split by a hyphen or whitespace token:
                # "non - commercial" -> "non-commercial"
                lemmas.append(f"{nlp_sent[tok_i - 2]}-{token}")
            else:
                lemmas.append(token)
    return lemmas
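
# Illustrative call (assumption: not in the original file; the exact
# output depends on spaCy's tagger/lemmatizer for this model):
#
#   lemmatize_tokens("You may not distribute the covered work")
#   # -> roughly ["may", "not-distribute"]: "may" survives as a modal,
#   #    "not" fuses into the verb, the rest fall to the stopword sets.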

def custom_textrank_summarizer(license_text,
                               min_sent_len=3,
                               summary_len=0.3,
                               debug=False):
    """Score license sentences against the weighted vocabulary tiers.

    Returns the highest-scoring `summary_len` fraction of sentences
    joined into a summary, plus the definitions extracted during
    cleaning.
    """
    sent_scores = Counter()
    cleaned_license_text, definitions = clean_license_text(license_text)
    cleaned_license_sentences = re.split(r'(\n{2,}|\.)', cleaned_license_text)
    cleaned_license_sentences = [
        text.strip() for text in cleaned_license_sentences
        if text.strip() not in ["", ".", "\n", "\n\n"]
    ]
    summary_len = math.ceil(summary_len * len(cleaned_license_sentences))
    if debug:
        print(f"summary length: {summary_len}")
        print(cleaned_license_sentences)
    for sent_i, sent in enumerate(cleaned_license_sentences):
        if len(sent.split()) < min_sent_len:
            continue
        score = 0
        lemmatized_tokens = lemmatize_tokens(sent)
        if not lemmatized_tokens:
            # Every token was a stopword; skip to avoid dividing by zero.
            continue
        if debug:
            print("-" * 50)
            print(f"\nOriginal Sentence = {sent}")
            print(f"\n{sent_i}. Lemmatized_tokens = {lemmatized_tokens}")
        word_count = Counter(lemmatized_tokens)
        for prop, prop_words in properties_dict.items():
            prop_score = 0
            imp_words = list()
            for prop_word in prop_words:
                if prop_word in word_count:
                    prop_score += properties_scores[prop]
                    imp_words.append(prop_word)
            if debug:
                print(prop, "=", imp_words, "=", prop_score)
            score += prop_score
        # Normalize by sentence length so short, dense sentences rank well.
        sent_scores[sent] = score / len(lemmatized_tokens)
        if debug:
            print(f"Sentence score: {sent_scores[sent]}")
            print()
    if debug:
        print(sent_scores)
    sorted_sent_scores = sent_scores.most_common()
    summary = ".\n".join(sent for sent, score in sorted_sent_scores[:summary_len])
    return summary, definitions
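
# Minimal usage sketch (assumption: this demo block is not part of the
# original file). It needs the spaCy model loaded above plus the
# clean/read_data modules to run; the sample text is illustrative.
if __name__ == "__main__":
    sample_license = (
        "Permission is hereby granted, free of charge, to any person "
        "obtaining a copy of this software, to use, copy, modify, merge, "
        "publish, and distribute copies of the software.\n\n"
        "The software is provided without warranty of any kind. "
        "You may not sublicense the software."
    )
    summary, definitions = custom_textrank_summarizer(
        sample_license, min_sent_len=3, summary_len=0.5, debug=False
    )
    print(summary)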