Spaces:

nihaldsouza1
/

clearlydefined_license_summarizer

Runtime error

File size: 14,725 Bytes

e41b03f
a804ced
 
 
 
e41b03f
ac750db
 
e41b03f
ac750db
 
e41b03f
ac750db
 
e41b03f
 
 
 
ac750db
 
 
e41b03f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a804ced
 
 
e41b03f
 
 
 
 
 
 
 
 
 
 
 
 
 
ac750db
 
 
a804ced
ac750db
 
e41b03f
 
 
ac750db
e41b03f
 
 
ac750db
 
 
a804ced
e41b03f
ac750db
 
e41b03f
 
 
 
 
 
 
ac750db
e41b03f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac750db
 
e41b03f
 
 
 
 
ac750db
e41b03f
ac750db
e41b03f
ac750db
1fdb52f
ac750db
 
 
 
 
 
 
a804ced
ac750db
 
 
 
 
 
 
 
 
 
e41b03f
a804ced
ac750db
 
 
e41b03f
a804ced
e41b03f
ac750db
 
a804ced
ac750db
 
a804ced
ac750db
e41b03f
 
 
 
 
ac750db
a804ced
e41b03f
a804ced
ac750db
e41b03f
 
a804ced
 
 
e41b03f
ac750db
a804ced
e41b03f

import pandas as pd
import spacy
import math
from collections import Counter


try:
    from src.clean import clean_license_text
    from src.parameters import color, vocab
except:
    from clean import clean_license_text
    from parameters import color, vocab


# Excel workbook of gold-standard sentences; read by get_system_scores via
# pd.read_excel. The path is relative to the current working directory.
GOLD_STANDARD_PATH = "../UBC-SAP_gold-corpus/UBC-SAP_capstone_corpus_labels_removed.xlsx"
# CSV of license property labels; read by get_labels_for_license via
# pd.read_csv.
LABELS_PATH = "data/choosealicense_appendix_labels.csv"
# Default minimum number of whitespace-separated tokens a sentence needs in
# order to be scored.
MIN_SENT_LEN = 3
# Default summary size, as a fraction of the number of cleaned sentences.
SUMMARY_LEN = 0.3

# spaCy pipeline shared by sentence segmentation and lemmatization; loaded
# once, when this module is imported.
nlp = spacy.load("en_core_web_sm")


def normalize_sentence_counter(counter):
    """
    Min-max rescale the sentence scores in the counter to the range 0 to 3.

    The mapping is modified in place and the same object is returned. Scaled
    scores are rounded to 3 decimal places.

    Parameters
    ----------
    counter : dict
        A dictionary of scores with keys as sentence and values as raw scores.

    Returns
    -------
    counter : dict
        The same dictionary, with values replaced by normalized scores. An
        empty dictionary is returned unchanged. If every raw score is equal,
        every normalized score is 0.

    """
    if not counter:
        return counter

    raw_scores = counter.values()
    min_val = min(raw_scores)
    score_range = max(raw_scores) - min_val

    if score_range == 0:
        # All sentences scored the same, so min-max scaling would divide by
        # zero; none of them stands out, hence every score becomes 0.
        for sent in counter:
            counter[sent] = 0
        return counter

    for sent in counter:
        counter[sent] = round(3 * (counter[sent] - min_val) / score_range, 3)
    return counter


def sent_tokenize_text(text, debug=False):
    """
    Tokenize a license text into sentences

    The text is split into paragraphs on blank lines ("\\n\\n") and each
    paragraph is segmented with the module-level spaCy pipeline. Newlines
    inside a sentence are removed. A sentence is appended to the previously
    collected entry (separated by one space) when that entry is at most 30
    characters long; otherwise it becomes a new entry. After each paragraph,
    "\\n\\n" is appended to the most recent entry, if there is one.

    Parameters
    ----------
    text : str
        License text to be tokenized into sentences.
    debug : bool, optional
        Toggles debug mode, which prints every segmented sentence. The
        default is False.

    Returns
    -------
    tokenized_sents : list
        A list of tokenized sentences.

    """
    # Entries no longer than this (measured on the stored string, including
    # any paragraph marker already appended) are treated as fragments and
    # absorb the sentence that follows them.
    short_fragment_len = 30

    tokenized_sents = []
    for para in text.split("\n\n"):
        for span in nlp(para).sents:
            sent = span.text.replace("\n", "").strip()
            if tokenized_sents and len(tokenized_sents[-1]) <= short_fragment_len:
                tokenized_sents[-1] += f" {sent}"
            else:
                tokenized_sents.append(sent)
        # Mark the paragraph boundary on the latest entry. Nothing to mark
        # while no sentence has been collected yet.
        if tokenized_sents:
            tokenized_sents[-1] += "\n\n"

    if debug:
        print("Segmented Sentences:")
        print("="*20)
        for i, sent in enumerate(tokenized_sents):
            print(f"Sent {i+1}")
            print("-"*20)
            print(sent)
            print("-"*50)
            print()
    return tokenized_sents


def lemmatize_tokens(sent):
    """
    Lemmatize the tokens of the given sentence

    Each token is reduced to its lowercased, whitespace-stripped lemma.
    Empty lemmas, license stopwords and negation words are not emitted on
    their own. A negation word is instead prefixed (joined with "-") to the
    lemma that directly follows it, or that follows it across one
    intermediate blank/hyphen token. Lemmas of two characters or fewer are
    dropped from the result.

    Parameters
    ----------
    sent : str
        A sentence whose tokens are to be lemmatized.

    Returns
    -------
    list
        A list of lemmatized tokens.

    """
    lemma_seq = [tok.lemma_.lower().strip() for tok in nlp(sent)]

    collected = []
    for idx, lemma in enumerate(lemma_seq):
        if (not lemma
                or lemma in vocab.license_stopwords
                or lemma in vocab.negation_words):
            continue

        # Negation immediately before this lemma, e.g. "not" + "use".
        if idx > 0 and lemma_seq[idx - 1] in vocab.negation_words:
            collected.append(f"{lemma_seq[idx - 1]}-{lemma}")
            continue

        # Negation two positions back with a separator in between. The
        # substring test against " -" matches "-" as well as the empty
        # string that a whitespace token becomes after strip().
        if (idx > 1
                and lemma_seq[idx - 1] in " -"
                and lemma_seq[idx - 2] in vocab.negation_words):
            collected.append(f"{lemma_seq[idx - 2]}-{lemma}")
            continue

        collected.append(lemma)

    return list(filter(lambda item: len(item) > 2, collected))


def get_license_summary_scores(license_text,
                               min_sent_len=MIN_SENT_LEN,
                               summary_len=SUMMARY_LEN,
                               summary_in_text_order=True,
                               return_summary_only=True,
                               debug=False,
                               cleaned_license_sentences=None):
    """
    Score every cleaned sentence of a license and return the scores together
    with the cleaned sentences, the extracted definitions and exceptions, and
    the number of sentences the summary should contain.

    A sentence's raw score is the sum, over every property in
    vocab.properties_dict, of that property's score from
    vocab.properties_scores once for each of the property's words found among
    the sentence's lemmatized tokens. Raw scores are then normalized with
    normalize_sentence_counter.

    Parameters
    ----------
    license_text : str
        License text. Ignored when cleaned_license_sentences is supplied.
    min_sent_len : int, optional
        The minimum number of whitespace-separated tokens in a sentence for
        it to be scored. The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Accepted for signature parity with the summarizer; not read by this
        function. The default is True.
    return_summary_only : bool, optional
        Accepted for signature parity with the summarizer; not read by this
        function. The default is True.
    debug : bool, optional
        Toggles debug mode (prints intermediate results). The default is
        False.
    cleaned_license_sentences : list, optional
        A list of already cleaned sentences. When given (and non-empty) the
        license text is not cleaned or tokenized. The default is None.

    Returns
    -------
    sent_scores : collections.Counter
        Normalized scores keyed by (sentence, sentence id) tuples. Sentences
        shorter than min_sent_len are absent.
    cleaned_license_sentences : list
        A list of cleaned sentences.
    definitions : str
        Definitions extracted from license text; "" when
        cleaned_license_sentences was supplied.
    exceptions : str
        Exceptions extracted from license text; "" when
        cleaned_license_sentences was supplied.
    summary_len : int
        Number of sentences for the summary: the requested proportion times
        the number of cleaned sentences, rounded up.

    """
    if cleaned_license_sentences:
        definitions, exceptions = "", ""
    else:
        cleaned_license_text, definitions, exceptions = clean_license_text(license_text)
        cleaned_license_sentences = sent_tokenize_text(cleaned_license_text, debug)

    # From here on summary_len is a sentence count rather than a proportion.
    summary_len = math.ceil(summary_len * len(cleaned_license_sentences))
    if debug:
        print(f"summary length:{summary_len}")

    sent_scores = Counter()

    for sent_i, sent in enumerate(cleaned_license_sentences):
        if len(sent.split()) < min_sent_len:
            continue

        lemmatized_tokens = lemmatize_tokens(sent)

        if debug:
            print("-"*50)
            print(f"\nOriginal Sentence = {sent}")
            print(f"\n{sent_i}. Lemmatized_tokens = {lemmatized_tokens}")

        # Only presence matters for scoring, not how often a lemma occurs.
        present_lemmas = set(lemmatized_tokens)

        score = 0
        for prop, prop_words in vocab.properties_dict.items():
            imp_words = [word for word in prop_words if word in present_lemmas]
            # One property score per matched word.
            prop_score = sum(vocab.properties_scores[prop] for _ in imp_words)

            if debug:
                print(prop, "=", imp_words, "=", prop_score)

            score += prop_score

        # The raw score is stored as is; it is not divided by the number of
        # lemmatized tokens.
        sent_scores[(sent, sent_i)] = score

        if debug:
            print(f"Sentence score: {sent_scores[(sent, sent_i)]}")
            print()

    sent_scores = normalize_sentence_counter(sent_scores)

    if debug:
        print(sent_scores)

    return sent_scores, cleaned_license_sentences, definitions, exceptions, summary_len


def get_sent_scores(license_text,
                    min_sent_len=MIN_SENT_LEN,
                    summary_len=SUMMARY_LEN,
                    summary_in_text_order=True,
                    return_summary_only=True,
                    debug=False,
                    cleaned_license_sentences=None):
    """
    Return (sentence id, score) pairs for the scored sentences of a license.

    Thin wrapper around get_license_summary_scores that forwards every
    argument and keeps only the sentence ids and their normalized scores.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        Forwarded unchanged to get_license_summary_scores. The default is
        True.
    return_summary_only : bool, optional
        Forwarded unchanged to get_license_summary_scores. The default is
        True.
    debug : bool, optional
        Toggles debug mode. The default is False.
    cleaned_license_sentences : list, optional
        A list of cleaned sentences. The default is None.

    Returns
    -------
    sent_id_scores : list(tuple)
        A list of tuples of sentence id and sentence score.

    """
    # Only the scores are needed here; the remaining return values
    # (sentences, definitions, exceptions, summary length) are discarded.
    sent_scores, *_ = get_license_summary_scores(
        license_text,
        min_sent_len=min_sent_len,
        summary_len=summary_len,
        summary_in_text_order=summary_in_text_order,
        return_summary_only=return_summary_only,
        debug=debug,
        cleaned_license_sentences=cleaned_license_sentences
    )

    # Score keys are (sentence text, sentence id); drop the text.
    return [(sent_i, score) for (_, sent_i), score in sent_scores.items()]


def custom_textrank_summarizer(license_text,
                               min_sent_len=MIN_SENT_LEN,
                               summary_len=SUMMARY_LEN,
                               summary_in_text_order=True,
                               return_summary_only=True,
                               debug=False):
    """
    Build an extractive summary (or a highlighted full text) of a license,
    along with its extracted definitions and exceptions.

    The highest scoring sentences, as ranked by get_license_summary_scores,
    are selected. They are either concatenated into a summary or wrapped in
    <mark> tags inside the full cleaned text.

    Parameters
    ----------
    license_text : str
        License text.
    min_sent_len : int, optional
        The minimum number of tokens in a sentence for it to be considered.
        The default is 3.
    summary_len : float, optional
        The proportion of length of the expected summary to the length of
        license text. The default is 0.3.
    summary_in_text_order : bool, optional
        If True the summary sentences appear in their original text order,
        otherwise in descending order of score. The default is True.
    return_summary_only : bool, optional
        If True return the summary, otherwise return the entire cleaned
        license text with the selected sentences highlighted. The default is
        True.
    debug : bool, optional
        Toggles debug mode. The default is False.

    Returns
    -------
    str
        Summary or the highlighted license text.
    definitions : str
        Definitions extracted from license text, with surrounding newlines
        and periods stripped and a single trailing period appended.
    exceptions : str
        Exceptions extracted from license text.

    """
    sent_scores, cleaned_license_sentences, definitions, exceptions, summary_len = get_license_summary_scores(
        license_text,
        min_sent_len=min_sent_len,
        summary_len=summary_len,
        summary_in_text_order=summary_in_text_order,
        return_summary_only=return_summary_only,
        debug=debug
    )

    # summary_len is now a sentence count; keep that many top scorers.
    top_scored = sent_scores.most_common()[:summary_len]

    if summary_in_text_order:
        # Keys are (sentence, sentence id); order by the sentence id.
        chosen = sorted(top_scored, key=lambda entry: entry[0][1])
    else:
        chosen = top_scored

    # NOTE(review): sentences are joined with an empty separator after their
    # periods/spaces are stripped, so sentences that do not end in a
    # paragraph marker run together. Confirm whether a separator is intended.
    summary = "".join(sent.strip(". ") for (sent, _), _ in chosen)
    selected_sent_ids = {sent_i for (_, sent_i), _ in chosen}

    rendered_sentences = []
    for sent_i, sent in enumerate(cleaned_license_sentences):
        if sent_i in selected_sent_ids:
            rendered_sentences.append(
                f"""<mark style="color: {color.BLACK}; background-color:{color.GREEN}">{sent}</mark>"""
            )
        else:
            rendered_sentences.append(sent)
    highlighted_license_text = " ".join(rendered_sentences)

    if debug:
        print("="*50)
        print("License Text:")
        print("-"*30)
        print(highlighted_license_text)
        print("="*50)

    # Always ends with exactly one period, even when nothing was extracted.
    definitions = definitions.strip("\n.") + "."

    body = summary if return_summary_only else highlighted_license_text
    return body, definitions, exceptions


def get_system_scores(attachment_id=None):
    """
    Compute system sentence scores for the licenses in the gold standard.

    The gold standard workbook at GOLD_STANDARD_PATH is read on every call
    and its sentences are grouped per attachment id. Each group is scored
    with get_sent_scores using the default summary length.

    Parameters
    ----------
    attachment_id : str, optional
        The attachment id of the document for which the sentence scores are to
        be calculated. If None (or any other falsy value), the sentence
        scores for all the documents will be returned. The default is None.

    Returns
    -------
    scores_dict : dict
        A dictionary of all the scores with keys as the attachment id of a
        document and values as a list of tuples of sentence id and scores for
        that attachment id.

    """
    corpus = pd.read_excel(GOLD_STANDARD_PATH)
    sentences_by_doc = (
        corpus[["attachment_id", "sentence"]]
        .groupby("attachment_id")["sentence"]
        .apply(list)
    )

    if attachment_id:
        # Single document requested: score only that one.
        targets = [(attachment_id, sentences_by_doc[attachment_id])]
    else:
        targets = sentences_by_doc.items()

    # The license text is passed as "" because the sentences are supplied
    # directly via cleaned_license_sentences.
    return {
        doc_id: get_sent_scores(
            "",
            summary_len=SUMMARY_LEN,
            cleaned_license_sentences=doc_sentences
        )
        for doc_id, doc_sentences in targets
    }


def preprocess_properties(cell):
    """
    Convert a license property string to title case, replacing hyphens and
    underscores with spaces.

    A double hyphen ("--") is treated as a separator and rendered as " - ".
    Any other character, including "$", is left untouched.

    Parameters
    ----------
    cell : str
        A cell string in properties dataframe of a license. Non-string
        values (for example missing values) are returned unchanged.

    Returns
    -------
    cell : str
        The prettified string, or the original value if it is not a string.

    """
    if not isinstance(cell, str):
        return cell

    # Split on the double hyphen first so that the single-hyphen replacement
    # cannot touch it; no placeholder character is needed this way.
    parts = (
        part.replace("-", " ").replace("_", " ")
        for part in cell.split("--")
    )
    return " - ".join(parts).title()

def get_labels_for_license(license_id, by_license_id=True):
    """
    Gets license properties for a given license_id.

    The labels CSV at LABELS_PATH is read on every call. The row selected by
    license_id is turned into a two-column table and every cell is
    prettified with preprocess_properties.

    Parameters
    ----------
    license_id : str
        License id of the license for which properties are to be returned.
    by_license_id : bool, optional
        A flag to decide whether we fetch the license properties by license id
        or license name. When True the first CSV column is used as the index,
        otherwise the second. The default is True.

    Returns
    -------
    properties : pandas.DataFrame
        Dataframe with columns "Property" and "Label" holding the properties
        of the license with id license_id.

    Raises
    ------
    KeyError
        If license_id is not present in the selected index column.

    """
    index_col = 0 if by_license_id else 1
    columns = ["Property", "Label"]
    labels_data = pd.read_csv(LABELS_PATH, index_col=index_col)
    properties = pd.DataFrame(labels_data.loc[license_id]).reset_index()
    properties.columns = columns
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; use whichever this pandas version provides.
    if hasattr(properties, "map"):
        properties = properties.map(preprocess_properties)
    else:
        properties = properties.applymap(preprocess_properties)
    return properties