import math
from collections import Counter

import spacy

from src.clean import clean_license_text
from src.read_data import read_file

# Lemmas that signal each license property within a sentence.
properties_dict = {
    "modify": ['modify', 'modification', 'change'],
    "distribute": ['distribute', 'distribution'],
    "copy": ['copy'],
    "copyright": ['copyright'],
    # TODO: add an "exception" property
}

# Weight added to a sentence's score per matched property word.
properties_scores = {
    "modify": 0.8,
    "distribute": 0.8,
    "copy": 0.8,
    "copyright": 0.9,
}

# Small English pipeline; sufficient for lemmatization.
nlp = spacy.load('en_core_web_sm')

def lemmatize_tokens(sent):
    '''Return the lowercased lemma of every token in the input sentence.'''
    return [token.lemma_.lower() for token in nlp(sent)]


def custom_textrank_summarizer(license_text, min_sent_len=2,
                               summary_len=0.3, debug=False):
    '''
    Score each sentence of a license by the property words it contains
    and return the top-scoring sentences as a summary.

    Parameters
    ----------
    license_text : str
        Raw license text.
    min_sent_len : int
        Sentences with fewer words than this are skipped.
    summary_len : float
        Fraction of the cleaned sentences to keep in the summary.
    debug : bool
        If True, print intermediate scores.

    Returns
    -------
    tuple
        (summary string, definitions extracted during cleaning)
    '''
    sent_scores = {}
    cleaned_license_text, definitions = clean_license_text(license_text)
    cleaned_license_sentences = cleaned_license_text.split('.')
    summary_len = math.ceil(summary_len * len(cleaned_license_sentences))
    if debug:
        print(f'summary length: {summary_len}')
        print(cleaned_license_sentences)
    for sent in cleaned_license_sentences:
        if debug:
            print(sent.split())
        if len(sent.split()) < min_sent_len:
            continue
        # Lemmatize once per sentence, not once per property.
        lemmatized_tokens = lemmatize_tokens(sent)
        word_count = Counter(lemmatized_tokens)
        score = 0
        for prop, prop_words in properties_dict.items():
            # Each property word found adds that property's weight.
            prop_score = sum(properties_scores[prop]
                             for prop_word in prop_words
                             if prop_word in word_count)
            if debug:
                print(prop, "=", prop_score)
            score += prop_score
        # Normalize by sentence length so long sentences are not favored.
        sent_scores[sent] = score / len(lemmatized_tokens)
        if debug:
            print(f'Sentence score: {sent_scores[sent]}')
            print()
    if debug:
        print(sent_scores)
    # Keep the highest-scoring sentences; note this reorders them by score,
    # not by their position in the original text.
    sorted_sent_scores = sorted(sent_scores.items(),
                                key=lambda item: item[1], reverse=True)
    summary = '.\n'.join(sent for sent, _ in sorted_sent_scores[:summary_len])
    return summary, definitions
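

# --- Usage sketch (assumptions flagged) ---
# A minimal example of running the summarizer end to end. The path below
# is hypothetical, and read_file is assumed to return the file's contents
# as a single string; adjust both to match src.read_data in this repo.
if __name__ == "__main__":
    license_text = read_file("data/licenses/mit.txt")  # hypothetical path
    summary, definitions = custom_textrank_summarizer(license_text,
                                                      debug=True)
    print(summary)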