Spaces:
Runtime error
Runtime error
File size: 2,175 Bytes
a804ced 1fdb52f a804ced 1fdb52f a804ced 1fdb52f a804ced |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
import nltk
import numpy as np
import gensim
import spacy
import math
from collections import Counter
from src.clean import clean_license_text
from src.read_data import read_file
# Keyword lemmas that signal each license "property".  A sentence is credited
# for a property by custom_textrank_summarizer() whenever one of these lemmas
# appears among the sentence's lemmatized tokens.
properties_dict = {
    "modify": ["modify", "modification", "change"],
    "distribute": ["distribute", "distribution"],
    "copy": ["copy"],
    "copyright": ["copyright"],
    # "exception" -- placeholder key left disabled; it is not scored.
}
# Weight added to a sentence's raw score for every keyword of the given
# property (see properties_dict) found in that sentence.
properties_scores = {
    "modify": 0.8,
    "distribute": 0.8,
    "copy": 0.8,
    "copyright": 0.9,
}
# Load the spaCy 'en_core_web_sm' pipeline once at import time; it is shared by
# every lemmatize_tokens() call instead of being reloaded per sentence.
nlp = spacy.load('en_core_web_sm')
def lemmatize_tokens(sent):
    """Return the lower-cased lemma of every token in ``sent``.

    The sentence is run through the module-level spaCy pipeline ``nlp`` and
    one lemma string is produced per token, in token order.
    """
    parsed = nlp(sent)
    lemmas = []
    for tok in parsed:
        lemmas.append(tok.lemma_.lower())
    return lemmas
def custom_textrank_summarizer(license_text, min_sent_len=2, summary_len=0.3, debug=False):
    """Build an extractive summary of a license by keyword-scoring its sentences.

    The text is cleaned with ``clean_license_text`` and split on ``'.'``.
    Each sufficiently long piece is scored: for every keyword in
    ``properties_dict`` whose lemma occurs in the sentence, the matching
    weight from ``properties_scores`` is added, and the total is divided by
    the sentence's token count.  The highest-scoring sentences make up the
    summary.

    Args:
        license_text: Raw license text; passed unchanged to
            ``clean_license_text``.
        min_sent_len: Pieces with fewer whitespace-separated words than this
            are skipped.
        summary_len: Fraction of the total number of '.'-separated pieces
            (including skipped ones) to keep; the product is rounded up with
            ``math.ceil``.
        debug: When true, intermediate values are printed to stdout.

    Returns:
        A ``(summary, definitions)`` tuple.  ``summary`` holds the selected
        sentences in descending score order, joined by a period followed by
        a newline.  ``definitions`` is the second value returned by
        ``clean_license_text``, passed through untouched.
    """
    cleaned_license_text, definitions = clean_license_text(license_text)
    cleaned_license_sentences = cleaned_license_text.split('.')
    # Keep the caller's fraction and the derived sentence count in separate
    # names instead of rebinding the parameter to a different type.
    num_summary_sentences = math.ceil(summary_len * len(cleaned_license_sentences))
    if debug:
        print(f'summary length:{num_summary_sentences}')
        print(cleaned_license_sentences)

    # Keyed by sentence text, so repeated sentences collapse into one entry.
    sent_scores = {}
    for sentence in cleaned_license_sentences:
        words = sentence.split()
        if debug:
            print(words)
        if len(words) < min_sent_len:
            continue

        # Lemmatize once per sentence: the result does not depend on the
        # property being scored, and the spaCy pass is the expensive step.
        lemmatized_tokens = lemmatize_tokens(sentence)
        if not lemmatized_tokens:
            # Only reachable when min_sent_len <= 0 lets an empty piece
            # through; skip it rather than divide by zero below.
            continue
        lemma_set = set(lemmatized_tokens)

        score = 0
        for prop, prop_words in properties_dict.items():
            prop_score = 0
            for prop_word in prop_words:
                if prop_word in lemma_set:
                    prop_score += properties_scores[prop]
            if debug:
                print(prop, "=", prop_score)
            score += prop_score

        # Normalize by length so long sentences are not favoured merely for
        # containing more words.
        sent_scores[sentence] = score / len(lemmatized_tokens)
        if debug:
            print(f'Sentence score: {sent_scores[sentence]}')
            print()

    if debug:
        print(sent_scores)

    # sorted() is stable, so equally scored sentences keep document order.
    ranked_sentences = sorted(sent_scores, key=sent_scores.get, reverse=True)
    summary = '.\n'.join(ranked_sentences[:num_summary_sentences])
    return summary, definitions
|