# Custom TextRank license summarizer — changes to UI
# Author: Nihal D'Souza (commit a804ced)
import nltk
import numpy as np
import gensim
import spacy
import math
from collections import Counter
from src.clean import clean_license_text
from src.read_data import read_file
nltk.download('punkt')  # fetch the Punkt sentence-tokenizer data if not already cached
# Lemmas that count as a mention of each license property of interest.
properties_dict = {
    "modify": ["modify", "modification", "change"],
    "distribute": ["distribute", "distribution"],
    "copy": ["copy"],
    "copyright": ["copyright"],
    # "exception"
}

# Relative weight of each property when scoring a sentence.
properties_scores = {
    "modify": 0.8,
    "distribute": 0.8,
    "copy": 0.8,
    "copyright": 0.9,
}
# Small English spaCy pipeline, loaded once at import time and reused for
# all lemmatization calls.
nlp = spacy.load('en_core_web_sm')
def lemmatize_tokens(sent):
    '''Return the lowercased lemma of every token in *sent*.

    The sentence is run through the module-level spaCy pipeline ``nlp``;
    each token contributes one lemma to the resulting list, in order.
    '''
    lemmas = []
    for token in nlp(sent):
        lemmas.append(token.lemma_.lower())
    return lemmas
def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, debug=False):
    '''
    Summarize a license text by scoring each sentence on how strongly it
    mentions the known license properties (modify / distribute / copy /
    copyright) and keeping the top-scoring sentences.

    Parameters
    ----------
    license_text : str
        Raw license text to summarize.
    min_sent_len : int
        Sentences with fewer whitespace-separated words than this are skipped.
    summary_len : float
        Fraction of the original sentence count to keep in the summary.
    debug : bool
        If True, print per-sentence scoring details.

    Returns
    -------
    tuple
        (summary, definitions): the '.\\n'-joined top sentences, and the
        definitions extracted by clean_license_text.
    '''
    # Number of sentences to keep, based on the original text's length.
    n_summary_sents = math.ceil(summary_len * len(license_text.split('.')))
    sent_scores = {}
    cleaned_license_text, definitions = clean_license_text(license_text)
    for sentence in cleaned_license_text.split('.'):
        if debug:
            print(sentence.split())
        # Skip short sentences; the original `break` wrongly aborted the
        # whole loop at the first short sentence.
        if len(sentence.split()) < min_sent_len:
            continue
        # Lemmatize once per sentence: the lemmas do not depend on which
        # property is being scored.
        lemmatized_tokens = lemmatize_tokens(sentence)
        if not lemmatized_tokens:
            # Guard the division below when min_sent_len permits empties.
            continue
        word_count = Counter(lemmatized_tokens)
        score = 0
        for prop, prop_words in properties_dict.items():
            prop_score = 0
            for prop_word in prop_words:
                if prop_word in word_count:
                    # Weight by the property, not the matched synonym:
                    # only property names are keys of properties_scores
                    # ('modification', 'change', ... would raise KeyError).
                    prop_score += properties_scores[prop]
            if debug:
                print(prop, "=", prop_score)
            score += prop_score
        # Normalize by sentence length so long sentences aren't favored.
        sent_scores[sentence] = score / len(lemmatized_tokens)
        if debug:
            print(f'Sentence score: {sent_scores[sentence]}')
            print()
    if debug:
        print(sent_scores)
    # Highest-scoring sentences first.
    sorted_sent_scores = dict(sorted(sent_scores.items(), key=lambda item: item[1], reverse=True))
    summary = '.\n'.join(list(sorted_sent_scores.keys())[:n_summary_sents])
    return summary, definitions