File size: 14,725 Bytes
e41b03f
a804ced
 
 
 
e41b03f
ac750db
 
e41b03f
ac750db
 
e41b03f
ac750db
 
e41b03f
 
 
 
ac750db
 
 
e41b03f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a804ced
 
 
e41b03f
 
 
 
 
 
 
 
 
 
 
 
 
 
ac750db
 
 
a804ced
ac750db
 
e41b03f
 
 
ac750db
e41b03f
 
 
ac750db
 
 
a804ced
e41b03f
ac750db
 
e41b03f
 
 
 
 
 
 
ac750db
e41b03f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac750db
 
e41b03f
 
 
 
 
ac750db
e41b03f
ac750db
e41b03f
ac750db
1fdb52f
ac750db
 
 
 
 
 
 
a804ced
ac750db
 
 
 
 
 
 
 
 
 
e41b03f
a804ced
ac750db
 
 
e41b03f
a804ced
e41b03f
ac750db
 
a804ced
ac750db
 
a804ced
ac750db
e41b03f
 
 
 
 
ac750db
a804ced
e41b03f
a804ced
ac750db
e41b03f
 
a804ced
 
 
e41b03f
ac750db
a804ced
e41b03f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
import pandas as pd
import spacy
import math
from collections import Counter


try:
    from src.clean import clean_license_text
    from src.parameters import color, vocab
except:
    from clean import clean_license_text
    from parameters import color, vocab


# Excel workbook with the gold-standard corpus; read by get_system_scores.
GOLD_STANDARD_PATH = "../UBC-SAP_gold-corpus/UBC-SAP_capstone_corpus_labels_removed.xlsx"
# CSV with license property labels; read by get_labels_for_license.
LABELS_PATH = "data/choosealicense_appendix_labels.csv"
# Default minimum number of whitespace-separated tokens a sentence needs to be
# scored.
MIN_SENT_LEN = 3
# Default summary size, as a fraction of the number of cleaned sentences.
SUMMARY_LEN = 0.3

# spaCy pipeline used for sentence segmentation and lemmatization in this
# module; loaded once when the module is imported.
nlp = spacy.load("en_core_web_sm")


def normalize_sentence_counter(counter):
    """
    Normalize sentence scores in the counter between 0 and 3

    Scores are min-max scaled, multiplied by 3 and rounded to three decimal
    places. The mapping is updated in place and also returned.

    Parameters
    ----------
    counter : dict
        A dictionary of scores with keys as sentence and values as raw scores.

    Returns
    -------
    counter : dict
        The same dictionary object with values replaced by normalized
        scores. An empty dictionary is returned unchanged. When every raw
        score is identical (including the single-entry case) all scores are
        set to 0.

    """
    if not counter:
        return counter

    min_val = min(counter.values())
    max_val = max(counter.values())
    score_range = max_val - min_val

    # Only values are reassigned (no keys added or removed), so iterating the
    # mapping while updating it is safe.
    for sent in counter:
        if score_range:
            counter[sent] = round(3 * (counter[sent] - min_val) / score_range, 3)
        else:
            # All scores are equal: scaling would divide by zero.
            counter[sent] = 0
    return counter


def sent_tokenize_text(text, debug=False):
    """
    Tokenize a license text into sentences

    The text is split into paragraphs on blank lines ("\\n\\n") and each
    paragraph is segmented with the module-level spaCy pipeline. A segment is
    appended to the previous entry (instead of starting a new one) when that
    previous entry is at most 30 characters long. After every paragraph a
    paragraph break ("\\n\\n") is appended to the last collected entry.

    Parameters
    ----------
    text : str
        License text to be tokenized into sentences.
    debug : bool, optional
        Toggles debug mode. The default is False.

    Returns
    -------
    tokenized_sents : list
        A list of tokenized sentences.

    """
    tokenized_sents = []

    for para in text.split("\n\n"):
        for span in nlp(para).sents:
            sent = span.text.replace("\n", "").strip()
            # Glue the new segment onto a very short preceding entry.
            if tokenized_sents and len(tokenized_sents[-1]) <= 30:
                tokenized_sents[-1] += f" {sent}"
            else:
                tokenized_sents.append(sent)

        # Mark the paragraph boundary; nothing to mark until at least one
        # sentence has been collected.
        if tokenized_sents:
            tokenized_sents[-1] += "\n\n"

    if debug:
        print("Segmented Sentences:")
        print("="*20)
        for i, sent in enumerate(tokenized_sents):
            print(f"Sent {i+1}")
            print("-"*20)
            print(sent)
            print("-"*50)
            print()
    return tokenized_sents


def lemmatize_tokens(sent):
    """
    Lemmatize the tokens in the given sentence

    Each token is reduced to its lower-cased, stripped lemma. Empty strings,
    license stopwords and negation words are not emitted on their own; a
    negation word directly before a kept token (optionally separated by one
    token that is a substring of " -") is fused with it as
    "<negation>-<token>". Results of length 2 or less are discarded.

    Parameters
    ----------
    sent : str
        A sentence whose tokens are to be lemmatized.

    Returns
    -------
    list
        A list of lemmatized tokens.

    """
    lowered = [tok.lemma_.lower().strip() for tok in nlp(sent)]

    kept = []
    for pos, current in enumerate(lowered):
        # Skip empty strings, stopwords and bare negation words.
        if (not current
                or current in vocab.license_stopwords
                or current in vocab.negation_words):
            continue

        # Negation immediately before the token.
        if pos > 0 and lowered[pos - 1] in vocab.negation_words:
            kept.append(f"{lowered[pos - 1]}-{current}")
            continue

        # Negation two positions back; the token in between must be a
        # substring of " -" (this is a substring test, so "" also matches).
        if (pos > 1
                and lowered[pos - 1] in " -"
                and lowered[pos - 2] in vocab.negation_words):
            kept.append(f"{lowered[pos - 2]}-{current}")
            continue

        kept.append(current)

    return list(filter(lambda lemma: len(lemma) > 2, kept))


def get_license_summary_scores(license_text,
                               min_sent_len=MIN_SENT_LEN,
                               summary_len=SUMMARY_LEN,
                               summary_in_text_order=True,
                               return_summary_only=True,
                               debug=False,
                               cleaned_license_sentences=None):
    """
    Get sentence scores for all the cleaned sentences in a given license_text
    along with other extracted details such as definitions, exceptions, etc.
    and the cleaned sentences themselves.

    A sentence's raw score is the sum, over every property in
    ``vocab.properties_dict``, of that property's score from
    ``vocab.properties_scores`` once for each of the property's words that
    occurs among the sentence's lemmatized tokens. Raw scores are then
    normalized with ``normalize_sentence_counter``.

    Parameters
    ----------
    license_text : str
        License text. Only used when ``cleaned_license_sentences`` is empty
        or None.
    min_sent_len : int, optional
        The minimum number of whitespace-separated tokens in a sentence for
        it to be scored. The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Not used by this function; accepted so that callers can forward the
        same keyword arguments. The default is True.
    return_summary_only : bool, optional
        Not used by this function; accepted so that callers can forward the
        same keyword arguments. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.
    cleaned_license_sentences : list, optional
        A list of cleaned sentences. When given (and non-empty) the license
        text is not cleaned or tokenized, and definitions and exceptions are
        returned as empty strings. The default is None.

    Returns
    -------
    sent_scores : collections.Counter
        Sentence scores with keys as tuples of sentence and sentence id and
        values as their normalized scores. Sentences shorter than
        ``min_sent_len`` are not included.
    cleaned_license_sentences : list
        A list of cleaned sentences.
    definitions : str
        Definitions extracted from license text.
    exceptions : str
        Exceptions extracted from license text.
    summary_len : int
        The number of sentences the summary should contain, i.e. the
        requested proportion times the number of cleaned sentences, rounded
        up.

    """

    if not cleaned_license_sentences:
        cleaned_license_text, definitions, exceptions = clean_license_text(license_text)
        cleaned_license_sentences = sent_tokenize_text(cleaned_license_text, debug)
    else:
        definitions, exceptions = "", ""

    sent_scores = Counter()

    # Convert the requested proportion into a sentence count.
    summary_len = math.ceil(summary_len * len(cleaned_license_sentences))

    if debug:
        print(f"summary length:{summary_len}")

    for sent_i, sent in enumerate(cleaned_license_sentences):

        if len(sent.split()) < min_sent_len:
            continue

        score = 0

        lemmatized_tokens = lemmatize_tokens(sent)

        if debug:
            print("-"*50)
            print(f"\nOriginal Sentence = {sent}")
            print(f"\n{sent_i}. Lemmatized_tokens = {lemmatized_tokens}")

        # Only presence matters for scoring, not how often a token occurs.
        present_tokens = set(lemmatized_tokens)

        for prop, prop_words in vocab.properties_dict.items():
            prop_score = 0

            imp_words = list()

            for prop_word in prop_words:
                if prop_word in present_tokens:
                    prop_score += vocab.properties_scores[prop]
                    imp_words.append(prop_word)

            if debug:
                print(prop, "=", imp_words, "=", prop_score)

            score += prop_score

        # The raw score is deliberately not divided by the sentence length.
        sent_scores[(sent, sent_i)] = score

        if debug:
            print(f"Sentence score: {sent_scores[(sent, sent_i)]}")
            print()

    sent_scores = normalize_sentence_counter(sent_scores)

    if debug:
        print(sent_scores)

    return sent_scores, cleaned_license_sentences, definitions, exceptions, summary_len


def get_sent_scores(license_text,
                    min_sent_len=MIN_SENT_LEN,
                    summary_len=SUMMARY_LEN,
                    summary_in_text_order=True,
                    return_summary_only=True,
                    debug=False,
                    cleaned_license_sentences=None):
    """
    Score the sentences of a license and return the scores keyed by
    sentence id only.

    All arguments are forwarded unchanged to get_license_summary_scores; the
    sentence text is dropped from its result.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Forwarded to get_license_summary_scores. The default is True.
    return_summary_only : bool, optional
        Forwarded to get_license_summary_scores. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.
    cleaned_license_sentences : list, optional
        A list of cleaned sentences. The default is None.

    Returns
    -------
    sent_id_scores : list(tuple)
        A list of tuples of sentence id and sentence score.

    """
    # Only the score mapping (first element of the result) is needed here.
    scores_by_sentence = get_license_summary_scores(
        license_text,
        min_sent_len=min_sent_len,
        summary_len=summary_len,
        summary_in_text_order=summary_in_text_order,
        return_summary_only=return_summary_only,
        debug=debug,
        cleaned_license_sentences=cleaned_license_sentences
    )[0]

    # Keys are (sentence, sentence id) pairs; keep the id and the score.
    return [(key[1], value) for key, value in scores_by_sentence.items()]


def custom_textrank_summarizer(license_text,
                               min_sent_len=MIN_SENT_LEN,
                               summary_len=SUMMARY_LEN,
                               summary_in_text_order=True,
                               return_summary_only=True,
                               debug=False):
    """
    Summarize a license text by keeping its highest scoring sentences, and
    return the summary (or the highlighted text) with the extracted
    definitions and exceptions.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Toggle to switch between summary in text order or in descending order
        by scores. The default is True.
    return_summary_only : bool, optional
        Toggle to return just the summary or entire license text with
        important sentences highlighted. The default is True.
    debug : bool, optional
        Toggles debug mode. The default is False.

    Returns
    -------
    str
        Summary or the highlighted license text.
    definitions : str
        Definitions extracted from license text, with surrounding newlines
        and periods stripped and a single trailing period added.
    exceptions : str
        Exceptions extracted from license text.

    """
    scored = get_license_summary_scores(
        license_text,
        min_sent_len=min_sent_len,
        summary_len=summary_len,
        summary_in_text_order=summary_in_text_order,
        return_summary_only=return_summary_only,
        debug=debug
    )
    sent_scores, cleaned_license_sentences, definitions, exceptions, summary_len = scored

    # Best-scoring sentences, highest first.
    top_scored = sent_scores.most_common()[:summary_len]

    # Optionally restore document order using the sentence id in the key.
    if summary_in_text_order:
        chosen = sorted(top_scored, key=lambda item: item[0][1])
    else:
        chosen = top_scored

    # NOTE(review): the pieces are concatenated with an empty separator after
    # stripping "." and " " from both ends - confirm this is intended.
    summary_parts = [key[0].strip(". ") for key, _ in chosen]
    summary = "".join(summary_parts)
    selected_sent_ids = {key[1] for key, _ in chosen}

    rendered = []
    for sent_i, sent in enumerate(cleaned_license_sentences):
        if sent_i in selected_sent_ids:
            rendered.append(
                f"""<mark style="color: {color.BLACK}; background-color:{color.GREEN}">{sent}</mark>"""
            )
        else:
            rendered.append(sent)
    highlighted_license_text = " ".join(rendered)

    if debug:
        print("="*50)
        print("License Text:")
        print("-"*30)
        print(highlighted_license_text)
        print("="*50)

    definitions = definitions.strip("\n.") + "."

    result_text = summary if return_summary_only else highlighted_license_text
    return result_text, definitions, exceptions


def get_system_scores(attachment_id=None):
    """
    Compute system sentence scores for the licenses in the gold standard.

    The gold-standard workbook is read on every call and its sentences are
    grouped per attachment id before scoring.

    Parameters
    ----------
    attachment_id : str, optional
        The attachment id of the document for which the sentence scores are to
        be calculated. If None, the sentence scores for all the documents will
        be returned. The default is None.

    Returns
    -------
    scores_dict : dict
        A dictionary of all the scores with keys as the attachment id of a
        document and values as a list of tuples of sentence id and scores for
        that attachment id.

    """
    corpus = pd.read_excel(GOLD_STANDARD_PATH)
    corpus = corpus[["attachment_id", "sentence"]]
    sentences_by_doc = corpus.groupby("attachment_id")["sentence"].apply(list)

    # Restrict to the requested document, or take every document.
    if attachment_id:
        targets = {attachment_id: sentences_by_doc[attachment_id]}
    else:
        targets = dict(sentences_by_doc)

    # The sentences are already cleaned, so no license text is passed.
    return {
        doc_id: get_sent_scores(
            "",
            summary_len=SUMMARY_LEN,
            cleaned_license_sentences=doc_sentences
        )
        for doc_id, doc_sentences in targets.items()
    }


def preprocess_properties(cell):
    """
    Converts license properties to title case and replaces hyphens and
    underscores with spaces.

    A double hyphen ("--") is rendered as " - "; single hyphens and
    underscores become single spaces. Note that any literal "$" in the input
    is also rendered as " - ", because "$" is used as the temporary
    placeholder for the double hyphen.

    Parameters
    ----------
    cell : str
        A cell string in properties dataframe of a license.

    Returns
    -------
    cell : str
        The formatted, title-cased string. Values that are not strings (for
        example missing values) are returned unchanged.

    """
    # Non-string cells are passed through untouched.
    if not isinstance(cell, str):
        return cell

    # Protect double hyphens before single hyphens are turned into spaces.
    cell = cell.replace("--", "$")
    cell = cell.replace("-", " ")
    cell = cell.replace("_", " ")
    return cell.replace("$", " - ").title()

def get_labels_for_license(license_id, by_license_id=True):
    """
    Gets license properties for a given license_id.

    The labels CSV is read on every call, indexed by its first column when
    ``by_license_id`` is True and by its second column otherwise.

    Parameters
    ----------
    license_id : str
        License id (or license name, when ``by_license_id`` is False) of the
        license for which properties are to be returned.
    by_license_id : bool, optional
        A flag to decide whether we fetch the license properties by license id
        or license name. The default is True.

    Returns
    -------
    properties : pandas.DataFrame
        Dataframe with columns "Property" and "Label" holding the properties
        of the requested license, with every cell formatted by
        preprocess_properties.

    Raises
    ------
    KeyError
        If license_id is not present in the chosen index column.

    """
    index_col = 0 if by_license_id else 1
    columns = ["Property", "Label"]
    labels_data = pd.read_csv(LABELS_PATH, index_col=index_col)
    properties = pd.DataFrame(labels_data.loc[license_id]).reset_index()
    properties.columns = columns

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; use the new name when available and fall back to the old
    # one on earlier pandas versions.
    if hasattr(properties, "map"):
        elementwise = properties.map
    else:
        elementwise = properties.applymap
    return elementwise(preprocess_properties)