Spaces:

nihaldsouza1
/

clearlydefined_license_summarizer

Runtime error

File size: 9,149 Bytes

import glob
import os
from collections import defaultdict

import numpy as np
import pandas as pd


# Locations used by read_license_data: the labels CSV is read from
# `data_directory`, the ".summary"/".txt" license files are globbed from
# `gold_licenses_data`, and `index_col` is the CSV column used as the index.
data_directory = "data/"
gold_licenses_data = "data/gold_licenses/"
index_col = "license_name"


# Names of the per-term label columns read from each row in augment_summary.
COMM_USE = "commercial-use"
DIST = "distribution"
MODS = "modifications"
PAT_USE = "patent-use"
PVT_USE = "private-use"
DISC_SRC = "disclose-source"
INCL_CPYRT = "include-copyright"
INCL_CPYRT_SRC = "include-copyright--source"
NW_USE_DISC = "network-use-disclose"
SAME_LIC = "same-license"
SAME_LIC_FILE = "same-license--file"
SAME_LIC_LIB = "same-license--library"
DOC_CHNG = "document-changes"
LIABILITY = "liability"
TRDMRK_USE = "trademark-use"
WRNTY = "warranty"

# Values a label cell is compared against in augment_summary; PERMISSIONS and
# LIMITATIONS also serve as the nested keys of summary_terms_dict[PAT_USE].
PERMISSIONS = "permissions"
CONDITIONS = "conditions"
LIMITATIONS = "limitations"

# Name of the column holding the license summary text.
SUMMARY = "summary"


# Sentence appended to a license summary for each labelled term (consumed by
# augment_summary). Keys are label column names. PAT_USE is the only nested
# entry: it holds one sentence per label value because both an explicit grant
# (PERMISSIONS) and an explicit non-grant (LIMITATIONS) have their own wording.
summary_terms_dict = {
    COMM_USE: "The licensed material and derivatives may be used for commercial purposes.",
    DIST: "The licensed material may be distributed.",
    MODS: "The licensed material may be modified.",
    PAT_USE: {
        PERMISSIONS: "This license provides an express grant of patent rights from contributors.",
        LIMITATIONS: "This license explicitly states that it does NOT grant any rights in the patents of contributors."
    },
    PVT_USE: "The licensed material may be used and modified in private.",
    DISC_SRC: "Source code must be made available when the licensed material is distributed.",
    INCL_CPYRT: "A copy of the license and copyright notice must be included with the licensed material.",
    INCL_CPYRT_SRC: "A copy of the license and copyright notice must be included with the licensed material in source form, but is not required for binaries.",
    NW_USE_DISC: "Users who interact with the licensed material via network are given the right to receive a copy of the source code.",
    SAME_LIC: "Modifications must be released under the same license when distributing the licensed material. In some cases a similar or related license may be used.",
    SAME_LIC_FILE: "Modifications of existing files must be released under the same license when distributing the licensed material. In some cases a similar or related license may be used.",
    SAME_LIC_LIB: "Modifications must be released under the same license when distributing the licensed material. In some cases a similar or related license may be used, or this condition may not apply to works that use the licensed material as a library.",
    DOC_CHNG: "Changes made to the licensed material must be documented.",
    LIABILITY: "This license includes a limitation of liability.",
    TRDMRK_USE: "This license explicitly states that it does NOT grant trademark rights, even though licenses without such a statement probably do not grant any implicit trademark rights.",
    WRNTY: "This license explicitly states that it does NOT provide any warranty."
}



def clean_data(text):
    """
    Stand-in for the real "clean_data" routine.

    Currently performs no cleaning at all: the argument is handed back
    unchanged.

    Parameters
    ----------
    text : str
        License text to be cleaned.

    Returns
    -------
    text : str
        The very same object that was passed in.

    """
    cleaned = text
    return cleaned


def read_file(file_path):
    """
    Loads the complete contents of a UTF-8 text file.

    Parameters
    ----------
    file_path : str
        Location of the file to load.

    Returns
    -------
    content : str
        Everything the file contains, decoded as UTF-8.

    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return handle.read()


def augment_summary(license_data):
    """
    Appends the standard sentence of every labelled term to each summary.

    For each row the summary is stripped of surrounding whitespace and the
    matching sentences from ``summary_terms_dict`` are appended, separated by
    single spaces, in a fixed order.

    Parameters
    ----------
    license_data : pandas.DataFrame
        Frame holding a "summary" column plus one label column per term.

    Returns
    -------
    license_data : pandas.DataFrame
        The same frame that was passed in, with its "summary" column
        overwritten by the augmented summaries. Rows whose summary is not a
        string (e.g. NaN for a license without a summary file) are left
        untouched.

    """
    # (label column, label value that triggers the sentence, sentence), listed
    # in the order the sentences are appended.
    # network-use-disclose is matched as a condition and trademark-use /
    # warranty as limitations, the same grouping fix_labels applies to these
    # columns; comparing them with "permissions" could never match.
    term_rules = (
        (COMM_USE, PERMISSIONS, summary_terms_dict[COMM_USE]),
        (DIST, PERMISSIONS, summary_terms_dict[DIST]),
        (MODS, PERMISSIONS, summary_terms_dict[MODS]),
        (PAT_USE, PERMISSIONS, summary_terms_dict[PAT_USE][PERMISSIONS]),
        (PAT_USE, LIMITATIONS, summary_terms_dict[PAT_USE][LIMITATIONS]),
        (PVT_USE, PERMISSIONS, summary_terms_dict[PVT_USE]),
        (DISC_SRC, CONDITIONS, summary_terms_dict[DISC_SRC]),
        (INCL_CPYRT, CONDITIONS, summary_terms_dict[INCL_CPYRT]),
        (INCL_CPYRT_SRC, CONDITIONS, summary_terms_dict[INCL_CPYRT_SRC]),
        (NW_USE_DISC, CONDITIONS, summary_terms_dict[NW_USE_DISC]),
        (SAME_LIC, CONDITIONS, summary_terms_dict[SAME_LIC]),
        (SAME_LIC_FILE, CONDITIONS, summary_terms_dict[SAME_LIC_FILE]),
        (SAME_LIC_LIB, CONDITIONS, summary_terms_dict[SAME_LIC_LIB]),
        (DOC_CHNG, CONDITIONS, summary_terms_dict[DOC_CHNG]),
        (LIABILITY, LIMITATIONS, summary_terms_dict[LIABILITY]),
        (TRDMRK_USE, LIMITATIONS, summary_terms_dict[TRDMRK_USE]),
        (WRNTY, LIMITATIONS, summary_terms_dict[WRNTY]),
    )

    # iterrows() may hand out copies of the rows, so assigning to `row` is not
    # guaranteed to reach the frame. Collect the new summaries and write the
    # whole column back instead.
    augmented = []
    for _, row in license_data.iterrows():
        summary = row[SUMMARY]
        if not isinstance(summary, str):
            # Missing summary: nothing to strip or extend.
            augmented.append(summary)
            continue

        parts = [summary.strip()]
        parts.extend(
            sentence
            for column, label, sentence in term_rules
            if row[column] == label
        )
        augmented.append(" ".join(parts))

    if augmented:
        license_data[SUMMARY] = augmented

    return license_data


def read_license_data(labels_file="choosealicense_appendix_labels.csv", drop_summary=False):
    """
    Reads the gold license texts and summaries and joins them onto the
    labels table.

    The gold license files are searched under ``gold_licenses_data`` relative
    to the current working directory and, if nothing is found there, relative
    to its parent directory. The labels CSV is located the same way under
    ``data_directory``.

    Parameters
    ----------
    labels_file : str, optional
        Name of the labels CSV file inside the data directory.
    drop_summary : bool, optional
        When True, the "summary" column is removed from the result.

    Returns
    -------
    merged_data : pandas.DataFrame or None
        The labels table indexed by ``index_col``, left-joined with the
        "text" and "summary" read from the ".txt" and ".summary" files (keyed
        by file name without extension), with the "spdx_id" column removed.
        None is returned, after printing a message, when the gold license
        files or the labels file cannot be found.

    """
    files = glob.glob(gold_licenses_data + "*")
    if not files:
        files = glob.glob(f"../{gold_licenses_data}" + "*")
    if not files:
        print("Gold licenses not found, please check the path again!")
        return None

    data_dict = defaultdict(dict)

    for file_path in files:
        # basename copes with whichever separator the platform's glob emits.
        file_name = os.path.basename(file_path)

        if file_name.endswith(".summary"):
            license_name = file_name[:-len(".summary")]
            data_dict[license_name]["summary"] = read_file(file_path)
        elif file_name.endswith(".txt"):
            license_name = file_name[:-len(".txt")]
            data_dict[license_name]["text"] = clean_data(read_file(file_path))

    # One row per license name, with "summary"/"text" columns.
    summary_df = pd.DataFrame(data_dict).T

    labels_df = None
    for base_directory in (data_directory, f"../{data_directory}"):
        try:
            labels_df = pd.read_csv(base_directory + labels_file, index_col=index_col)
        except FileNotFoundError:
            # Only a missing file triggers the fallback; parse errors and the
            # like are real problems and must not be reported as "not found".
            continue
        break

    if labels_df is None:
        print("Labels file not found, please check the path again!")
        return None

    merged_data = labels_df.join(summary_df).drop(columns=["spdx_id"])

    if drop_summary:
        merged_data = merged_data.drop(columns=["summary"])

    return merged_data


def read_license_summary_data(aug_summary=False):
    """
    Loads the license data and keeps only the text and summary columns.

    Parameters
    ----------
    aug_summary : bool, optional
        When True, the summaries are run through ``augment_summary`` before
        the columns are selected.

    Returns
    -------
    pandas.DataFrame
        Frame made of the "text" and "summary" columns, in that order.

    """
    frame = read_license_data()
    if aug_summary:
        frame = augment_summary(frame)

    wanted_columns = ["text", "summary"]
    return frame[wanted_columns]


def read_license_labels_data():
    """
    Loads the license data without its "summary" column.

    Returns
    -------
    pandas.DataFrame
        Everything ``read_license_data`` returns except the "summary" column.

    """
    full_data = read_license_data()
    return full_data.drop(columns=["summary"])


def fix_labels(license_data):
    """
    Replaces the textual label values with integer codes.

    The label columns are handled in four groups, each with its own value
    mapping. The replaced columns are written back into the frame that was
    passed in.

    Parameters
    ----------
    license_data : pandas.DataFrame
        Frame containing all of the label columns listed below.

    Returns
    -------
    license_data : pandas.DataFrame
        The same frame, with the label columns re-coded.

    """
    missing = np.nan

    # (columns, value mapping) pairs, applied in this order.
    encodings = [
        (
            ["commercial-use", "distribution", "modifications", "private-use"],
            # NOTE(review): unlike the other groups, "permissions" becomes 0
            # and missing values are not mapped at all - confirm intended.
            {"permissions": 0},
        ),
        (
            [
                "disclose-source",
                "document-changes",
                "include-copyright",
                "include-copyright--source",
                "network-use-disclose",
                "same-license",
                "same-license--file",
                "same-license--library",
            ],
            {missing: 0, "conditions": 1},
        ),
        (
            ["liability", "trademark-use", "warranty"],
            {missing: 0, "limitations": 1},
        ),
        (
            ["patent-use"],
            {missing: 0, "permissions": 1, "limitations": 2},
        ),
        # A "GTLC_Permissive" group (missing -> 0, "permissive" -> 1,
        # "not_permissive" -> 2) is intentionally not applied.
    ]

    for columns, mapping in encodings:
        license_data[columns] = license_data[columns].replace(mapping)

    return license_data