Spaces:

nihaldsouza1
/

clearlydefined_license_summarizer

Runtime error

File size: 18,857 Bytes

import re
import json
from bs4 import BeautifulSoup
from striprtf.striprtf import rtf_to_text
from collections import defaultdict


# Placeholder token joined between paragraphs while the text is cleaned; it is
# replaced by a blank line ("\n\n") once cleaning is finished.
PARA_BREAK = "para___break"
# Horizontal rule of 50 "=" characters printed between sections of verbose
# output. NOTE: the name is a misspelling of "separator", but functions in
# this module reference it under this exact name.
seperator = "=" * 50
# Module-level verbosity value. NOTE(review): the functions visible in this
# file take their own ``verbosity`` parameter and do not appear to read this.
verbosity = 0


def extract_author_details(text, verbosity=0):
    """
    Extracts author-related annotation lines from the license text.

    Every occurrence of "@author", "@license", "@copyright" or "@package",
    together with the rest of its line, is collected and removed from the
    text.

    Parameters
    ----------
    text : str
        Raw License text.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    text : str
        License text with author details removed.
    author_details : list
        The removed annotation strings, in order of appearance.

    """
    author_details_pattern = re.compile(
        r"(@(author|license|copyright|package).*)"
    )
    # Collect first, then strip in a second pass. This avoids using a
    # replacement callback purely for its side effect (a callback that returns
    # None is outside the documented contract of re.sub).
    author_details = [
        match.group(1) for match in author_details_pattern.finditer(text)
    ]
    text = author_details_pattern.sub("", text)

    if author_details and verbosity != 0:
        print(seperator)
        print(seperator)
        print("Following author details were extracted:")
        print(seperator)
        print(author_details)
        print()

    return text, author_details


def php_cleaner(text):
    """
    Extracts the first block comment from a license file in PHP format.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        The first ``/* ... */`` comment found in the text (delimiters
        included), or an empty string when the text has no block comment.

    """
    # Raw string: the pattern contains backslashes, which are invalid string
    # escapes in a normal literal. The match is lazy so that only the first
    # comment is returned when the file contains several.
    block_comment = re.search(r"/\*[\S\s]*?\*/", text)
    if block_comment is None:
        return ""
    return block_comment.group(0)


def html_cleaner(text):
    """
    Cleans the license file in HTML format.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Text content of the document's <body>, or of the whole document when
        it has no <body> element. Empty string when there is no text.

    """
    soup = BeautifulSoup(text, features="html.parser")
    # html.parser does not add a missing <body>, so soup.body is None for
    # fragments such as "<html><p>...</p></html>". Fall back to the whole
    # document instead of failing with AttributeError.
    container = soup.body if soup.body is not None else soup
    return container.get_text()


def json_cleaner(text_dict):
    """
    Flattens the relevant entries of a JSON license file into plain text.

    Parameters
    ----------
    text_dict : dict
        Dictionary as read from the raw License file.

    Returns
    -------
    str
        One "key: value, " fragment for each "description" and "license"
        entry, concatenated in the dictionary's own order. Empty string when
        neither key is present.

    """
    wanted_keys = ("description", "license")
    fragments = [
        key + ": " + str(value) + ", "
        for key, value in text_dict.items()
        if key in wanted_keys
    ]
    return "".join(fragments)


def rtf_cleaner(text):
    """
    Converts a license file in RTF format to plain text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        The result of passing the text through ``rtf_to_text``.

    """
    plain_text = rtf_to_text(text)
    return plain_text


def url_cleaner(text):
    """
    Strips URLs out of the License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        License text in which every run starting with "http" (optionally
        wrapped in parentheses) up to the next whitespace has been deleted.

    """
    url_pattern = re.compile(r"\(?http\S+\)?")
    return url_pattern.sub("", text)


def email_cleaner(text):
    """
    Strips email addresses out of the License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        License text with every matched email address deleted.

    """
    email_pattern = re.compile(r"[\w\._-]+@\w{2,}\.\w+")
    return email_pattern.sub("", text)


def var_cleaner(text):
    """
    Strips tokens that look like variables out of the License text.

    Two kinds of token are deleted, in this order: "$" followed by word
    characters, and brace-delimited groups made of word characters,
    whitespace, brackets, quotes and basic punctuation.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        License text with the variable-like tokens removed.

    """
    variable_patterns = (
        r"\$\w+",
        r"{[{}()\w\s._,\[\]'\"]+}",
    )
    for pattern in variable_patterns:
        text = re.sub(pattern, "", text)
    return text


def character_cleaner(text):
    """
    Removes URLs, emails, variable-like tokens and unnecessary special
    characters from the License text.

    Parameters
    ----------
    text : str
        License text, optionally containing PARA_BREAK markers.

    Returns
    -------
    text : str
        Cleaned License text with some special characters removed.

    """
    # Pad the paragraph markers with spaces before URL removal so that a
    # marker directly following a URL is not consumed by the URL pattern's
    # greedy non-whitespace run; the padding is undone afterwards.
    text = text.replace(PARA_BREAK, f" {PARA_BREAK} ")
    text = url_cleaner(text)
    text = text.replace(f" {PARA_BREAK} ", PARA_BREAK)

    text = email_cleaner(text)
    text = var_cleaner(text)

    # Raw strings keep the regex escapes (\s, \-, \.) out of Python's own
    # string-escape processing, which warns on unknown escapes.
    text = re.sub(r"\s*(;quot;|&amp)\s*", " ", text)
    # Blank lines become sentence breaks.
    text = re.sub(r"\n{2,}", ". ", text)
    # Runs of markup-ish punctuation and whitespace collapse to one space.
    text = re.sub(r"[:%#<>=*\-/·\s{}]+", " ", text)
    # Runs of dots and spaces collapse to a single ". ".
    text = re.sub(r"[\. ]{2,}", ". ", text)

    html_strs = (
        "&rsquo;",
        "&ldquo;",
        "&middot;",
        "&plusmn;",
        "&hellip;",
        "&sbquo;",
        "&mdash;",
        "&apos;",
        "&trade;",
        "&Dagger;",
        "&bull;",
        "&laquo;",
        "&prime;",
        "&quot;",
        "&lsquo;",
        "&asymp;",
        "&Prime;",
        "&frac12;",
        "&sect;",
        "&pound;",
        "&cent;",
        "&para;",
        "&raquo;",
        "&dagger;",
        "&rdquo;",
        "&euro;",
        "&copy;",
        "&bdquo;",
        "&ndash;",
        "&deg;",
        "&reg;",
        "&lt;",
        "&gt;",
        "&le;",
        "&ge;",
        "&ne;",
    )

    # The entities are literal strings, so plain replacement is sufficient.
    for html_str in html_strs:
        text = text.replace(html_str, "")

    return text


def isEnglish(s):
    """
    Checks whether the License text consists only of ASCII characters.

    This is used as a cheap proxy for "the text is in English": any
    non-ASCII character makes the check fail.

    Parameters
    ----------
    s : str
        Raw License text.

    Returns
    -------
    bool
        True if every character of the text is ASCII (including the empty
        string), False otherwise.

    """
    try:
        # Encoding straight to ASCII fails on any non-ASCII character,
        # including lone surrogates, which a UTF-8 encode/ASCII decode round
        # trip would reject with an uncaught UnicodeEncodeError.
        s.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True


def split_definitions_exceptions(text, remove_exceptions, verbosity=0):
    """
    Splits the License text into paragraphs, definitions and exceptions.

    Parameters
    ----------
    text : str
        Raw License text.
    remove_exceptions : bool
        True if we want to remove exceptions from the License text, False
        otherwise
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    paras : list
        A list of paragraphs from License text with definitions (and,
        when requested, exceptions) removed.
    definitions : str
        Definitions extracted from the License text.
    exceptions : list
        A list of paragraphs which contain exceptions.

    """
    definitions = ""

    if "Definitions" in text:
        # A numbered "Definitions" heading, e.g. "Section 1. Definitions",
        # "1 - Definitions" or "1. Additional definitions". The separator
        # class lists ".", "-" and the en dash explicitly; written as
        # "[\.|-|–]" the hyphen formed a range and was never matched.
        def_pattern = (
            r"([Ss]ection )?[0-9] ?[.\-–]? ?([Aa]dditional )?[Dd]efinitions"
        )
        # The next numbered heading after the definitions section.
        after_def_pattern = r"\s+(Section )?[0-9]\.? [.\-–]? ?[A-Z][a-z]+"

        def_match = re.search(def_pattern, text)
        if def_match is not None:
            def_start, def_end = def_match.span()
            next_heading = re.search(after_def_pattern, text[def_end:])
            # The section is only cut out when both its start and the
            # following heading are found; otherwise the text is left as is.
            if next_heading is not None:
                section_end = def_end + next_heading.start()
                definitions = text[def_start:section_end].strip() + "\n\n"
                text = text[:def_start] + text[section_end:]

    paras, more_defs = extract_relevant_paras(
        split_paras(text, verbosity=verbosity),
        verbosity=verbosity
    )

    definitions += more_defs.strip()
    definitions = "\n\n".join(split_paras(definitions, verbosity=verbosity))

    paras, exceptions = get_exeptions(paras, remove_exceptions, verbosity=verbosity)

    return paras, definitions, exceptions


def discard_text_after_end_tnc(text):
    """
    Keeps only the text that precedes "END OF TERMS AND CONDITIONS".

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Everything before the first occurrence of the marker, or the whole
        text when the marker is absent.

    """
    kept, _marker, _discarded = text.partition("END OF TERMS AND CONDITIONS")
    return kept


def clear_preamble(text):
    """
    Cleans Preamble from the License text

    The preamble is taken to run from the word "Preamble" up to and
    including the phrase "distribution and modification follow". It is only
    removed when "Preamble" occurs exactly once, more than 100 characters
    precede it, and the closing phrase is found after it.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    text : str
        License text with Preamble removed, or the unchanged text when the
        conditions above are not met.

    """
    preamble_marker = "Preamble"
    # Raw string so the regex escapes are not treated as string escapes.
    dist_and_mod_pattern = r"distribution\s+and\s+modification\s+follow\.?"

    preamble_split = text.split(preamble_marker)
    if len(preamble_split) != 2:
        # Marker absent, or ambiguous because it occurs more than once.
        return text

    before_preamble, from_preamble = preamble_split
    if len(before_preamble) <= 100:
        return text

    pieces = re.split(dist_and_mod_pattern, from_preamble)
    if len(pieces) < 2:
        # Closing phrase not found: the end of the preamble is unknown.
        return text

    return before_preamble + pieces[1].strip()


def gnu_cleaner(text):
    """
    Trims a GNU-style license down to its terms.

    The text following "END OF TERMS AND CONDITIONS" is dropped first, and
    the Preamble is then removed from what remains.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        License text without the trailing material and without the Preamble.

    """
    return clear_preamble(discard_text_after_end_tnc(text))


def preprocess_text(text):
    """
    Applies license-family specific preprocessing to the License text.

    Texts mentioning "GNU" or "Apache" are passed through ``gnu_cleaner``;
    any other text is returned unchanged.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    text : str
        The preprocessed License text.

    """
    needs_gnu_cleaning = any(marker in text for marker in ("GNU", "Apache"))
    return gnu_cleaner(text) if needs_gnu_cleaning else text


def clean_if_else(text):
    """
    Strips preprocessor-style conditional blocks from the License text.

    Each span from a "#if..." directive through the nearest "#endif" (and
    any whitespace after it) is deleted.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        License text without the conditional blocks, with leading and
        trailing whitespace stripped.

    """
    conditional_block = re.compile(r"#\bif[\s\S]+?#endif\s*")
    without_blocks = conditional_block.sub("", text)
    return without_blocks.strip()


def clean_comments(text):
    """
    Strips fenced comment blocks from the License text.

    A block opens and closes with a run of three or more backtick,
    single-quote or double-quote characters; the fences and everything
    between them are deleted.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        License text without the fenced blocks, with leading and trailing
        whitespace stripped.

    """
    fenced_block = re.compile(r"[\`'\"]{3,}[\s\S]*?[\`'\"]{3,}")
    without_blocks = fenced_block.sub("", text)
    return without_blocks.strip()


def script_cleaner(text):
    """
    Cleans the script text from License text to extract the main content.

    The format is detected from markers in the text (PHP, HTML, RTF, or a
    JSON object) and the matching cleaner is applied. Conditional blocks and
    fenced comments are then removed from the result.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text without scripts. Empty string when nothing is
        left after format-specific cleaning.

    """
    try:
        if "<?php" in text:
            text = php_cleaner(text)
        elif "</html>" in text:
            text = html_cleaner(text)
        elif "\\rtf" in text:
            text = rtf_cleaner(text)
        elif text.startswith("{") and text.endswith("}"):
            text = json_cleaner(json.loads(text))
    except Exception:
        # Best effort: if format-specific cleaning fails (for example on
        # malformed JSON), carry on with the text as it was. Only Exception
        # is caught so that KeyboardInterrupt and SystemExit still propagate.
        pass
    if not text:
        return ""

    text = clean_if_else(text)
    text = clean_comments(text)

    return text


def split_paras(text, verbosity=0):
    """
    Breaks the text into paragraphs.

    The widest delimiter that occurs at least twice is used: four newlines,
    then two, then one. If none qualifies, the whole text is one paragraph.
    For the two wider delimiters, the newlines left inside a paragraph are
    turned into spaces.

    Parameters
    ----------
    text : str
        Raw License text.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    paras : list
        The paragraphs, each stripped of surrounding whitespace. Empty
        strings may be present.

    """
    quad_break = "\n" * 4
    double_break = "\n" * 2

    # Longer runs of newlines are normalised to exactly four first.
    text = re.sub(r"\n{4,}", quad_break, text)

    if text.count(quad_break) >= 2:
        paras = [
            re.sub(r"\n{1,3}", " ", chunk) for chunk in text.split(quad_break)
        ]
    elif text.count(double_break) >= 2:
        paras = [chunk.replace("\n", " ") for chunk in text.split(double_break)]
    elif text.count("\n") >= 2:
        paras = text.split("\n")
    else:
        paras = [text]

    paras = [chunk.strip() for chunk in paras]

    if verbosity != 0:
        print(seperator)
        print(seperator)
        print("These are the split paras in the text:")
        for chunk in paras:
            if chunk.strip():
                print(seperator)
                print(chunk)
        print()

    return paras


def extract_relevant_paras(paras, verbosity=0):
    """
    Separates definition-like paragraphs from the remaining paragraphs.

    A paragraph counts as a definition when it contains a short quoted term
    followed closely by "mean", "include" or "refer". Blank paragraphs are
    dropped.

    Parameters
    ----------
    paras : list
        A list of split paragraphs.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    cleaned_paras : list
        The non-blank paragraphs that are not definitions.
    definitions : str
        The definition paragraphs joined with blank lines, which is to be
        appended to other definitions in the License text if any.

    """
    definition_regex = re.compile(
        r"""\".{0,20}\".{0,40}(mean|include|refer)s?"""
    )
    verbose = verbosity != 0

    kept_paras = []
    definition_paras = []

    if verbose:
        print(seperator)
        print(seperator)
        print("Following paragraphs were considered unnecessary and removed:")

    for para in paras:
        if not para.strip():
            continue
        if definition_regex.search(para) is None:
            kept_paras.append(para)
            continue
        definition_paras.append(para)
        if verbose:
            print(seperator)
            print(para)

    if verbose:
        print()

    definitions = "".join(para + "\n\n" for para in definition_paras).strip()

    return kept_paras, definitions


def get_all_caps(text, verbosity=0):
    """
    Extracts all-caps passages from the License text.

    A passage is a run of at least 50 characters on one line that contains
    no lowercase ASCII letter.

    Parameters
    ----------
    text : str
        Raw License text.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    text : str
        License text with all caps sentences removed and runs of three or
        more newlines collapsed to two.
    all_caps : list
        A list of all caps sentences from the License text.

    """
    all_caps_pattern = re.compile(r"([^a-z\n]{50,})")
    # Collect first, then strip in a second pass. This avoids using a
    # replacement callback purely for its side effect (a callback that returns
    # None is outside the documented contract of re.sub).
    all_caps = all_caps_pattern.findall(text)
    text = all_caps_pattern.sub("", text)
    # Removing a passage can leave a gap of blank lines behind.
    text = re.sub(r"\n{3,}", "\n\n", text)

    if all_caps and verbosity != 0:
        print(seperator)
        print(seperator)
        print("Following all caps were removed from the text:")
        print(all_caps)
        print()

    return text, all_caps


def get_exeptions(paras, remove_exceptions, verbosity=0):
    """
    Separates the paragraphs that mention exceptions from the License text.

    A paragraph is an exception paragraph when it contains the word
    "exception" in any letter case.

    Parameters
    ----------
    paras : list
        A list of paragraphs from the License text.
    remove_exceptions : bool
        When True, exception paragraphs are left out of the first returned
        list; when False, that list contains every paragraph.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    non_exception_paras : list
        The paragraphs kept for summarization, in their original order.
    exceptions : list
        The paragraphs that mention exceptions, in their original order.

    """
    kept_paras = []
    exception_paras = []

    for para in paras:
        mentions_exception = "exception" in para.lower()
        if mentions_exception:
            exception_paras.append(para)
        if not (mentions_exception and remove_exceptions):
            kept_paras.append(para)

    if exception_paras and verbosity != 0:
        print(seperator)
        print(seperator)
        print("Following exceptions were found in the text:")
        for para in exception_paras:
            print(seperator)
            print(para)
        print()

    return kept_paras, exception_paras


def get_MIT_content(text):
    """
    Sorts the paragraphs of an MIT-like license into categories.

    The categories are "header", "copyright", "copyright+content",
    "content", "sentence" and "all_cap"; only categories that received at
    least one paragraph appear in the result.

    Parameters
    ----------
    text : str
        Cleaned MIT License text.

    Returns
    -------
    dictionary
        A defaultdict whose keys are category names and whose values are the
        paragraphs of that category joined with blank lines.
    """
    mit_content = defaultdict(list)

    for raw_para in split_paras(text):
        para = raw_para.strip()
        if not para:
            continue

        word_count = len(para.split())
        has_copyright = "Copyright" in para
        mentions_license = "Licens" in para or "licens" in para

        # The checks are ordered: the first one that applies decides.
        if word_count <= 10 and mentions_license and not has_copyright:
            category = "header"
        elif has_copyright:
            if "is hereby granted" in para:
                category = "copyright+content"
            else:
                category = "copyright"
        elif "Permission is hereby granted" in para:
            category = "content"
        elif "The above copyright notice" in para or word_count < 18:
            category = "sentence"
        elif get_all_caps(para)[1]:
            category = "all_cap"
        else:
            category = "content"

        mit_content[category].append(para)

    # Replace each list of paragraphs with a single joined string.
    for category in list(mit_content):
        mit_content[category] = "\n\n".join(mit_content[category])

    return mit_content


def get_most_likely_license_type(text):
    """
    Returns the most likely license type based on Doc2Vec scores
    (similarity > 0.9).

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        The "License" value of the top inference result when its "Scores"
        value is above 0.9, otherwise "Not Found".
    """
    # The inference module is importable under two paths. Only an import
    # failure should trigger the fallback; any other error raised while
    # importing is left to propagate.
    try:
        from src.doc2vec import inference
    except ImportError:
        from doc2vec import inference

    # Row 0 is taken as the best match.
    # NOTE(review): assumes the results are sorted by descending score.
    top1_result = inference(text).loc[0, :]

    if top1_result["Scores"] > 0.9:
        return top1_result["License"]
    return "Not Found"


def clean_license_text(text, remove_exceptions=False, verbosity=0):
    """
    Runs the full cleaning pipeline over a License text.

    Author annotations and script wrappers are stripped, license-family
    preprocessing is applied, definitions and exceptions are split off, and
    special characters are removed from the remaining paragraphs.

    Parameters
    ----------
    text : str
        Raw License text.
    remove_exceptions : bool, optional
        Toggles whether or not to remove exceptions from the cleaned license.
        The default is False.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    text : str
        Cleaned License text, with paragraphs separated by blank lines.
    definitions : str
        Definitions extracted from the License text.
    exceptions : str
        Exception paragraphs extracted from the License text, joined with
        single spaces.

    All three values are empty strings when the input is empty, or when
    neither the cleaned text nor the checked words near its end pass
    ``isEnglish``.

    """
    if len(text) == 0:
        return text, "", ""

    # The extracted author details are not used any further here.
    text, _author_details = extract_author_details(text, verbosity=verbosity)
    text = preprocess_text(script_cleaner(text))

    paras, definitions, exception_paras = split_definitions_exceptions(
        text, remove_exceptions, verbosity=verbosity
    )

    # Paragraphs travel through character cleaning as one string, joined by
    # the placeholder marker, and are separated by blank lines afterwards.
    text = character_cleaner(PARA_BREAK.join(paras))
    text = re.sub(PARA_BREAK, "\n\n", text).strip()

    # Reject the text only when both the full text and a few of its last
    # words fail the check.
    if not isEnglish(text) and not isEnglish(" ".join(text.split()[-5:-1])):
        return "", "", ""

    return text, definitions, " ".join(exception_paras)