Spaces:

nihaldsouza1
/

clearlydefined_license_summarizer

Runtime error

clearlydefined_license_summarizer / src /textrank.py

Nihal D'Souza

Pushing latest development branch

ac750db about 3 years ago

5.27 kB

	import re
	import nltk
	import numpy as np
	import gensim
	import spacy
	import math
	from collections import Counter

	# The helpers live in the ``src`` package when this module is imported from
	# the project root, and next to this file when it is run from inside
	# ``src``.  Only a failed import should trigger the fallback; any other
	# error raised while importing the helpers is a real problem and must
	# surface instead of being masked by a second, misleading failure.
	try:
	    from src.clean import clean_license_text
	    from src.read_data import read_file
	except ImportError:
	    from clean import clean_license_text
	    from read_data import read_file


	# NOTE(review): not referenced anywhere in the code visible in this file;
	# presumably meant as a weight for negated terms -- confirm before relying
	# on it or removing it.
	NEGATION_WEIGHT = 0.2

	# spaCy pipeline loaded once at import time.  It is used below for its
	# default stop-word list and by lemmatize_tokens() to tokenize/lemmatize.
	nlp = spacy.load("en_core_web_sm")

	# Modal verbs.  They are removed from the stop-word set further down and
	# form the highest-weighted ("3.0") bucket of the scoring vocabulary.
	# "could" and "might" are currently disabled.
	modal_verbs = {"can", "may", "must", "shall", "will", "should", "would"}

	# Negated forms of the modal verbs above ("could not" and "might not" are
	# currently disabled as well).
	# NOTE(review): not referenced by the code visible in this file.
	neg_modal = {
	    "cannot", "may not", "must not", "shall not",
	    "will not", "should not", "would not",
	}

	# TODO Move these structures to another file
	#
	# Tokens that lemmatize_tokens() discards: spaCy's default stop words plus
	# a handful of punctuation marks and license-specific filler words, minus
	# the modal verbs (which must survive so they can be scored).
	#
	# Candidates that are currently disabled:
	#   copyright, use, copy, may, terms, without, free, distribute, rights,
	#   notice, shall, permission, including, condition, must, modify,
	#   distribution, warranty
	license_stopwords = set(nlp.Defaults.stop_words).union({
	    ",", "(", ")",
	    "software", "license", "work", "program",
	    "source", "code", "provided", "version",
	    "library", "covered", "public",
	}).difference(modal_verbs)

	# Lemmas treated as negations: lemmatize_tokens() never emits them on their
	# own but glues them onto the following kept lemma ("not" + "copy" ->
	# "not-copy").
	negation_words = {"no", "not", "non"}

	# TODO: Consider adding these words to the vocab:
	# no-charge
	#
	#
	#
	#

	# Verbs that make up the "0.7" bucket of the scoring vocabulary.  The order
	# is kept as-is because it determines the order of the debug output.
	verbs = [
	    "permit",
	    "copy",
	    "modify",
	    "change",
	    "sell",
	    "reproduce",
	    "transfer",
	    "rent",
	    "lease",
	    "assign",
	    "sublet",
	    "distribute",
	    "redistribute",
	    "allow",
	    "require",
	    "merge",
	    "publish",
	    "use",
	    "include",
	    "grant",
	    "run",
	    "affirm",
	    "propagate",
	    "acknowledge",
	]

	# Negated counterpart of every verb, in the "not-<verb>" form that
	# lemmatize_tokens() produces for a verb preceded by "not".
	neg_verbs = [f"not-{action}" for action in verbs]

	# Scoring vocabulary used by custom_textrank_summarizer().
	#
	# Each key is a bucket label that is also a key of properties_scores; each
	# value is the collection of lemmas belonging to that bucket.  For every
	# entry of a bucket that occurs in a sentence the summarizer adds the
	# bucket's score once, so a word must appear at most once per bucket --
	# a repeated entry would silently count that word twice ("permission" used
	# to be listed twice under "0.6").
	properties_dict = {
	    "0.1": [],
	    "0.2": ["everyone"],
	    "0.3": ["irrevocable"],
	    "0.4": [],
	    "0.5": [],
	    "0.6": [
	        "distribution", "redistribution",
	        "permission", "modification",
	        "copyright",
	        "limitation",
	        "free", "charge",
	        "warranty",
	        "term", "terms", "condition",
	        "right",
	        "sublicense",
	        "commercial", "non-commercial",
	        "exception",
	    ],
	    # "+ []" keeps these buckets independent copies of the source lists.
	    "0.7": verbs + [],
	    "0.8": [],
	    "0.9": neg_verbs + [],
	    "1.0": [],
	    "3.0": modal_verbs,
	}


	# Numeric weight of every bucket label used in properties_dict: the label
	# is simply the decimal spelling of its own weight.
	properties_scores = {
	    bucket: float(bucket)
	    for bucket in (
	        "0.1", "0.2", "0.3", "0.4", "0.5",
	        "0.6", "0.7", "0.8", "0.9", "1.0",
	        "3.0",
	    )
	}


	def lemmatize_tokens(sent):
	    """Return the scoring lemmas of ``sent`` with negations folded in.

	    The sentence is run through the module-level spaCy pipeline and every
	    token is reduced to its lower-cased, stripped lemma.  Empty lemmas,
	    entries of ``license_stopwords`` and bare ``negation_words`` are
	    dropped.  A kept lemma that directly follows a negation word -- or
	    follows one with a single hyphen/blank token in between -- is emitted
	    as ``"<negation>-<lemma>"`` (for example ``"not-copy"``).

	    Args:
	        sent: Sentence text to process.

	    Returns:
	        List of lemma strings in sentence order.
	    """
	    lemma_seq = [tok.lemma_.lower().strip() for tok in nlp(sent)]
	    kept = []

	    for idx, lemma in enumerate(lemma_seq):
	        # Nothing to emit for blanks, stop words or the negations themselves.
	        if not lemma or lemma in license_stopwords or lemma in negation_words:
	            continue

	        if idx > 0 and lemma_seq[idx - 1] in negation_words:
	            negator = lemma_seq[idx - 1]
	        elif (idx > 1
	              # Substring test: true for "-" and for "" (a whitespace token
	              # after stripping).
	              and lemma_seq[idx - 1] in " -"
	              and lemma_seq[idx - 2] in negation_words):
	            negator = lemma_seq[idx - 2]
	        else:
	            negator = None

	        kept.append(lemma if negator is None else f"{negator}-{lemma}")

	    return kept


	def custom_textrank_summarizer(
	    license_text,
	    min_sent_len=3,
	    summary_len=0.3,
	    debug=False,
	):
	    """Build an extractive summary of a license from keyword-weighted sentences.

	    The text is cleaned with ``clean_license_text``, split into sentences at
	    blank lines and periods, and every sentence is scored as the sum of the
	    bucket weights (``properties_scores``) of the ``properties_dict`` words
	    found among its lemmas, divided by the number of lemmas.  The
	    best-scoring sentences are returned, highest score first.

	    Args:
	        license_text: Raw license text.
	        min_sent_len: Sentences with fewer whitespace-separated words than
	            this are ignored.
	        summary_len: Fraction of the number of sentences to keep; the
	            resulting count is rounded up.
	        debug: When true, print intermediate results to stdout.

	    Returns:
	        A ``(summary, definitions)`` tuple: ``summary`` is the selected
	        sentences joined with ``".\\n"``; ``definitions`` is the second
	        value returned by ``clean_license_text``, passed through unchanged.
	    """
	    sent_scores = Counter()

	    cleaned_license_text, definitions = clean_license_text(license_text)

	    # Sentence boundaries are runs of two or more newlines, or a period.
	    # The pattern must be a raw string with an unescaped "|": written as
	    # "\|" the bar is a literal pipe character, the alternation disappears
	    # and the text is practically never split.
	    cleaned_license_sentences = [
	        piece.strip()
	        for piece in re.split(r"\n{2,}|\.", cleaned_license_text)
	        if piece.strip()
	    ]

	    summary_len = math.ceil(summary_len * len(cleaned_license_sentences))

	    if debug:
	        print(f"summary length:{summary_len}")
	        print(cleaned_license_sentences)

	    for sent_i, sent in enumerate(cleaned_license_sentences):
	        if len(sent.split()) < min_sent_len:
	            continue

	        lemmatized_tokens = lemmatize_tokens(sent)

	        if debug:
	            print("-"*50)
	            print(f"\nOriginal Sentence = {sent}")
	            print(f"\n{sent_i}. Lemmatized_tokens = {lemmatized_tokens}")

	        if not lemmatized_tokens:
	            # Only stop words / negations were present: there is nothing to
	            # score and normalising by zero tokens would raise
	            # ZeroDivisionError, so the sentence is left out.
	            continue

	        present = set(lemmatized_tokens)
	        score = 0

	        for prop, prop_words in properties_dict.items():
	            imp_words = [word for word in prop_words if word in present]
	            # One increment of the bucket weight per matched word (repeated
	            # addition keeps the floating-point results unchanged).
	            prop_score = sum(properties_scores[prop] for _ in imp_words)

	            if debug:
	                print(prop, "=", imp_words, "=", prop_score)

	            score += prop_score

	        # Normalise by length so long sentences are not favoured.
	        sent_scores[sent] = score / len(lemmatized_tokens)

	        if debug:
	            print(f"Sentence score: {sent_scores[sent]}")
	            print()

	    if debug:
	        print(sent_scores)

	    sorted_sent_scores = sent_scores.most_common()

	    summary = ".\n".join(
	        sent for sent, _ in sorted_sent_scores[:summary_len]
	    )

	    return summary, definitions