# Custom TextRank license summarizer — changes to UI
# Author: Nihal D'Souza (commit a804ced)
import nltk
import numpy as np
import gensim
import spacy
import math
from collections import Counter
from src.clean import clean_license_text
from src.read_data import read_file
nltk.download('punkt')  # fetch the Punkt sentence-tokenizer data if not already cached
# Lemmas that count as a mention of each license property of interest.
properties_dict = {
    "modify": ["modify", "modification", "change"],
    "distribute": ["distribute", "distribution"],
    "copy": ["copy"],
    "copyright": ["copyright"],
    # "exception"
}

# Relative weight of each property when scoring a sentence.
properties_scores = {
    "modify": 0.8,
    "distribute": 0.8,
    "copy": 0.8,
    "copyright": 0.9,
}
# Small English spaCy pipeline, loaded once at import time and reused for
# all lemmatization calls.
nlp = spacy.load('en_core_web_sm')
def lemmatize_tokens(sent):
    '''Return the lowercased lemma of every token in *sent*.

    The sentence is run through the module-level spaCy pipeline ``nlp``;
    each token contributes one lemma to the resulting list, in order.
    '''
    lemmas = []
    for token in nlp(sent):
        lemmas.append(token.lemma_.lower())
    return lemmas
def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, debug=False):
    '''
    Summarize a license text by scoring each sentence on how strongly it
    mentions the known license properties (modify / distribute / copy /
    copyright) and keeping the top-scoring sentences.

    Parameters
    ----------
    license_text : str
        Raw license text to summarize.
    min_sent_len : int
        Sentences with fewer whitespace-separated words than this are skipped.
    summary_len : float
        Fraction of the original sentence count to keep in the summary.
    debug : bool
        If True, print per-sentence scoring details.

    Returns
    -------
    tuple
        (summary, definitions): the '.\\n'-joined top sentences, and the
        definitions extracted by clean_license_text.
    '''
    # Number of sentences to keep, based on the original text's length.
    n_summary_sents = math.ceil(summary_len * len(license_text.split('.')))
    sent_scores = {}
    cleaned_license_text, definitions = clean_license_text(license_text)
    for sentence in cleaned_license_text.split('.'):
        if debug:
            print(sentence.split())
        # Skip short sentences; the original `break` wrongly aborted the
        # whole loop at the first short sentence.
        if len(sentence.split()) < min_sent_len:
            continue
        # Lemmatize once per sentence: the lemmas do not depend on which
        # property is being scored.
        lemmatized_tokens = lemmatize_tokens(sentence)
        if not lemmatized_tokens:
            # Guard the division below when min_sent_len permits empties.
            continue
        word_count = Counter(lemmatized_tokens)
        score = 0
        for prop, prop_words in properties_dict.items():
            prop_score = 0
            for prop_word in prop_words:
                if prop_word in word_count:
                    # Weight by the property, not the matched synonym:
                    # only property names are keys of properties_scores
                    # ('modification', 'change', ... would raise KeyError).
                    prop_score += properties_scores[prop]
            if debug:
                print(prop, "=", prop_score)
            score += prop_score
        # Normalize by sentence length so long sentences aren't favored.
        sent_scores[sentence] = score / len(lemmatized_tokens)
        if debug:
            print(f'Sentence score: {sent_scores[sentence]}')
            print()
    if debug:
        print(sent_scores)
    # Highest-scoring sentences first.
    sorted_sent_scores = dict(sorted(sent_scores.items(), key=lambda item: item[1], reverse=True))
    summary = '.\n'.join(list(sorted_sent_scores.keys())[:n_summary_sents])
    return summary, definitions