Spaces:
Runtime error
Runtime error
import nltk | |
import numpy as np | |
import gensim | |
import spacy | |
import math | |
from collections import Counter | |
from src.clean import clean_license_text | |
from src.read_data import read_file | |
# Download the NLTK 'punkt' resource as a side effect of importing this module.
# NOTE(review): no NLTK tokenizer is called in the code visible here — confirm
# this import-time download is still required.
nltk.download('punkt')
# License properties of interest, each mapped to the lemmatized keywords whose
# presence in a sentence signals that property.
# TODO: an "exception" property was stubbed out here but never defined.
properties_dict = {
    "modify": ["modify", "modification", "change"],
    "distribute": ["distribute", "distribution"],
    "copy": ["copy"],
    "copyright": ["copyright"],
}

# Weight contributed to a sentence score per property, keyed by the same
# property names as properties_dict.
properties_scores = {
    "modify": 0.8,
    "distribute": 0.8,
    "copy": 0.8,
    "copyright": 0.9,
}
# Load the spaCy 'en_core_web_sm' pipeline once at import time; lemmatize_tokens
# calls this module-level object for every sentence it processes.
nlp = spacy.load('en_core_web_sm')
def lemmatize_tokens(sent):
    '''Return the lowercased lemma of every token in `sent`.

    The sentence is run through the module-level spaCy pipeline `nlp`; the
    result has one entry per token the pipeline yields, in order.
    '''
    parsed = nlp(sent)
    return [word.lemma_.lower() for word in parsed]
def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, debug=False):
    '''Build an extractive summary of a license from keyword-weighted sentences.

    The text is cleaned with `clean_license_text` and split on '.'. Each
    sentence is lemmatized; for every keyword of `properties_dict` found among
    its lemmas, the weight of that keyword's property (from
    `properties_scores`) is added to the sentence score. The total is divided
    by the sentence's token count and the highest-scoring sentences are kept.

    Args:
        license_text: Raw license text.
        min_sent_len: Minimum number of whitespace-separated words a sentence
            needs in order to be scored; shorter fragments are skipped.
        summary_len: Fraction of the '.'-separated sentences of the raw text
            to keep in the summary (rounded up).
        debug: When true, print intermediate tokens and scores.

    Returns:
        A `(summary, definitions)` tuple: the selected sentences in descending
        score order, joined by a period followed by a newline, and the second
        value returned by `clean_license_text`, passed through unchanged.
    '''
    # The sentence budget is derived from the raw text, before cleaning.
    max_sentences = math.ceil(summary_len * len(license_text.split('.')))
    sent_scores = {}
    cleaned_license_text, definitions = clean_license_text(license_text)

    for sentence in cleaned_license_text.split('.'):
        if debug:
            print(sentence.split())
        # Skip (rather than stop at) short fragments such as list numbering,
        # so a stray "1." does not discard the rest of the license.
        if len(sentence.split()) < min_sent_len:
            continue

        # Lemmatize once per sentence: the result does not depend on the
        # property being scored.
        lemmatized_tokens = lemmatize_tokens(sentence)
        if not lemmatized_tokens:
            # Nothing to normalise by (possible when min_sent_len <= 0).
            continue
        lemmas = set(lemmatized_tokens)

        score = 0
        for prop, prop_words in properties_dict.items():
            # Weights are keyed by property name, not by individual keyword.
            weight = properties_scores[prop]
            prop_score = sum(weight for word in prop_words if word in lemmas)
            if debug:
                print(prop, "=", prop_score)
            score += prop_score

        # Normalise by length so long sentences are not favoured automatically.
        sent_scores[sentence] = score / len(lemmatized_tokens)
        if debug:
            print(f'Sentence score: {sent_scores[sentence]}')
            print()

    if debug:
        print(sent_scores)

    # Stable sort: sentences with equal scores keep their original order.
    ranked = sorted(sent_scores, key=sent_scores.get, reverse=True)
    summary = '.\n'.join(ranked[:max_sentences])
    return summary, definitions