# NOTE: page-capture residue, not part of the module (original header read:
# "Spaces: Runtime error / Runtime error").
import numpy as np | |
import pandas as pd | |
import glob | |
from collections import defaultdict | |
# --- Locations of the input data --------------------------------------------
data_directory = "data/"
gold_licenses_data = "data/gold_licenses/"

# Column of the labels CSV that is used as the DataFrame index.
index_col = "license_name"

# --- License term identifiers (used as label column names) ------------------
COMM_USE = "commercial-use"
DIST = "distribution"
MODS = "modifications"
PAT_USE = "patent-use"
PVT_USE = "private-use"

DISC_SRC = "disclose-source"
INCL_CPYRT = "include-copyright"
INCL_CPYRT_SRC = "include-copyright--source"
NW_USE_DISC = "network-use-disclose"
SAME_LIC = "same-license"
SAME_LIC_FILE = "same-license--file"
SAME_LIC_LIB = "same-license--library"
DOC_CHNG = "document-changes"

LIABILITY = "liability"
TRDMRK_USE = "trademark-use"
WRNTY = "warranty"

# --- Category values a term column can hold ---------------------------------
PERMISSIONS = "permissions"
CONDITIONS = "conditions"
LIMITATIONS = "limitations"

# Name of the column holding the license summary.
SUMMARY = "summary"

# Sentence describing each license term. Every entry is a single sentence,
# except PAT_USE, which maps a category (PERMISSIONS / LIMITATIONS) to the
# sentence used for that category.
summary_terms_dict = {
    COMM_USE: "The licensed material and derivatives may be used for commercial purposes.",
    DIST: "The licensed material may be distributed.",
    MODS: "The licensed material may be modified.",
    PAT_USE: {
        PERMISSIONS: "This license provides an express grant of patent rights from contributors.",
        LIMITATIONS: "This license explicitly states that it does NOT grant any rights in the patents of contributors.",
    },
    PVT_USE: "The licensed material may be used and modified in private.",
    DISC_SRC: "Source code must be made available when the licensed material is distributed.",
    INCL_CPYRT: "A copy of the license and copyright notice must be included with the licensed material.",
    INCL_CPYRT_SRC: "A copy of the license and copyright notice must be included with the licensed material in source form, but is not required for binaries.",
    NW_USE_DISC: "Users who interact with the licensed material via network are given the right to receive a copy of the source code.",
    SAME_LIC: "Modifications must be released under the same license when distributing the licensed material. In some cases a similar or related license may be used.",
    SAME_LIC_FILE: "Modifications of existing files must be released under the same license when distributing the licensed material. In some cases a similar or related license may be used.",
    SAME_LIC_LIB: "Modifications must be released under the same license when distributing the licensed material. In some cases a similar or related license may be used, or this condition may not apply to works that use the licensed material as a library.",
    DOC_CHNG: "Changes made to the licensed material must be documented.",
    LIABILITY: "This license includes a limitation of liability.",
    TRDMRK_USE: "This license explicitly states that it does NOT grant trademark rights, even though licenses without such a statement probably do not grant any implicit trademark rights.",
    WRNTY: "This license explicitly states that it does NOT provide any warranty.",
}
def clean_data(text):
    """
    Stand-in for the real "clean_data" routine, which is meant to replace
    this definition later on.

    Parameters
    ----------
    text : str
        Text to be cleaned.

    Returns
    -------
    text : str
        The input, handed back untouched.
    """
    return text
def read_file(file_path):
    """
    Loads the complete contents of a UTF-8 encoded text file.

    Parameters
    ----------
    file_path : str
        Location of the file to load.

    Returns
    -------
    content : str
        Everything that was read from the file at file_path.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return handle.read()
def augment_summary(license_data):
    """
    Appends, to every license summary, the standard sentence of each license
    term that applies to that license.

    Parameters
    ----------
    license_data : pandas.DataFrame
        One row per license. Must contain the "summary" column and one column
        per license term; a term applies when its cell equals the category
        listed for it in the rule table below ("permissions", "conditions" or
        "limitations"). Any other cell value (e.g. NaN) means the term is
        skipped.

    Returns
    -------
    license_data : pandas.DataFrame
        The same DataFrame object, whose "summary" column now holds the
        stripped original summary followed by the applicable sentences,
        separated by single spaces. A missing (non-string) or empty summary
        contributes nothing, so the result then consists of the term
        sentences only.
    """
    # (term column, category that triggers it, sentence to append), listed in
    # the order in which the sentences are appended. The categories of
    # "network-use-disclose" (conditions) and "trademark-use" / "warranty"
    # (limitations) match the column grouping used by fix_labels; comparing
    # them against "permissions" could never match.
    term_rules = (
        (COMM_USE, PERMISSIONS, summary_terms_dict[COMM_USE]),
        (DIST, PERMISSIONS, summary_terms_dict[DIST]),
        (MODS, PERMISSIONS, summary_terms_dict[MODS]),
        (PAT_USE, PERMISSIONS, summary_terms_dict[PAT_USE][PERMISSIONS]),
        (PAT_USE, LIMITATIONS, summary_terms_dict[PAT_USE][LIMITATIONS]),
        (PVT_USE, PERMISSIONS, summary_terms_dict[PVT_USE]),
        (DISC_SRC, CONDITIONS, summary_terms_dict[DISC_SRC]),
        (INCL_CPYRT, CONDITIONS, summary_terms_dict[INCL_CPYRT]),
        (INCL_CPYRT_SRC, CONDITIONS, summary_terms_dict[INCL_CPYRT_SRC]),
        (NW_USE_DISC, CONDITIONS, summary_terms_dict[NW_USE_DISC]),
        (SAME_LIC, CONDITIONS, summary_terms_dict[SAME_LIC]),
        (SAME_LIC_FILE, CONDITIONS, summary_terms_dict[SAME_LIC_FILE]),
        (SAME_LIC_LIB, CONDITIONS, summary_terms_dict[SAME_LIC_LIB]),
        (DOC_CHNG, CONDITIONS, summary_terms_dict[DOC_CHNG]),
        (LIABILITY, LIMITATIONS, summary_terms_dict[LIABILITY]),
        (TRDMRK_USE, LIMITATIONS, summary_terms_dict[TRDMRK_USE]),
        (WRNTY, LIMITATIONS, summary_terms_dict[WRNTY]),
    )

    augmented = []
    # iterrows() is used for reading only: the rows it yields may be copies,
    # so writing to them is not guaranteed to reach the DataFrame.
    for _, row in license_data.iterrows():
        summary = row[SUMMARY]
        # A license without a summary file ends up with NaN here after the
        # join in read_license_data; treat that as an empty summary.
        base = summary.strip() if isinstance(summary, str) else ""
        parts = [base] if base else []
        parts.extend(
            sentence
            for column, category, sentence in term_rules
            if row[column] == category
        )
        augmented.append(" ".join(parts))

    # Write all summaries back in one positional assignment (also safe when
    # the index contains duplicate labels). Skipped for an empty frame so no
    # column is created or retyped needlessly.
    if augmented:
        license_data[SUMMARY] = augmented
    return license_data
def read_license_data(labels_file="choosealicense_appendix_labels.csv", drop_summary=False):
    """
    Reads the gold license texts and summaries and joins them onto the
    license labels.

    Both the gold license files and the labels file are looked up relative to
    the current working directory first and relative to its parent directory
    second.

    Parameters
    ----------
    labels_file : str
        Name of the labels CSV file inside the data directory.
    drop_summary : bool
        If True, the "summary" column is removed from the returned data.

    Returns
    -------
    merged_data : pandas.DataFrame or None
        The labels, indexed by license name, joined with the "text" and
        "summary" columns built from the gold license files, without the
        "spdx_id" column. None (after printing a message) if the gold license
        files or the labels file cannot be found.

    Raises
    ------
    Exception
        Any error other than a missing labels file raised while reading the
        CSV (e.g. a malformed file) is propagated instead of being reported
        as "not found".
    """
    search_roots = ("", "../")

    files = []
    for root in search_roots:
        files = glob.glob(f"{root}{gold_licenses_data}" + "*")
        if files:
            break
    if not files:
        print("Gold licenses not found, please check the path again!")
        return None

    summary_suffix = ".summary"
    text_suffix = ".txt"

    data_dict = defaultdict(dict)
    for file_path in files:
        # glob can return "\\" separators (Windows), possibly mixed with "/";
        # normalise them so the file name is always the last "/" component.
        file_name = file_path.replace("\\", "/").rsplit("/", 1)[-1]
        if file_name.endswith(summary_suffix):
            license_name = file_name[:-len(summary_suffix)]
            data_dict[license_name]["summary"] = read_file(file_path)
        elif file_name.endswith(text_suffix):
            license_name = file_name[:-len(text_suffix)]
            data_dict[license_name]["text"] = clean_data(read_file(file_path))
    # Transpose so that licenses become rows and "summary"/"text" columns.
    summary_df = pd.DataFrame(data_dict).T

    labels_df = None
    for root in search_roots:
        try:
            labels_df = pd.read_csv(f"{root}{data_directory}" + labels_file, index_col=index_col)
        except FileNotFoundError:
            # Only a missing file triggers the fallback location.
            continue
        break
    if labels_df is None:
        print("Labels file not found, please check the path again!")
        return None

    merged_data = labels_df.join(summary_df).drop(columns=["spdx_id"])
    if drop_summary:
        merged_data = merged_data.drop(columns=["summary"])
    return merged_data
def read_license_summary_data(aug_summary=False):
    """
    Reads the license data and keeps only the license text and its summary.

    Parameters
    ----------
    aug_summary : bool
        If True, the summaries are passed through augment_summary before the
        columns are selected.

    Returns
    -------
    license_data : pandas.DataFrame or None
        DataFrame with the columns "text" and "summary", or None if
        read_license_data could not find its input files.
    """
    license_data = read_license_data()
    if license_data is None:
        # read_license_data has already printed which input is missing;
        # pass its None on instead of failing with an unrelated error.
        return None
    if aug_summary:
        license_data = augment_summary(license_data)
    return license_data[["text", "summary"]]
def read_license_labels_data():
    """
    Reads the license data without the "summary" column.

    Returns
    -------
    license_data : pandas.DataFrame or None
        The result of read_license_data with the "summary" column dropped, or
        None if read_license_data could not find its input files.
    """
    # drop_summary=True performs the same column drop inside
    # read_license_data and hands a None result through unchanged.
    return read_license_data(drop_summary=True)
def fix_labels(license_data):
    """
    Recodes the textual term labels of the license data as integers.

    Parameters
    ----------
    license_data : pandas.DataFrame
        Data containing all the license term columns listed below. The
        columns are overwritten in this DataFrame.

    Returns
    -------
    license_data : pandas.DataFrame
        The same DataFrame object with the recoded term columns.
    """
    absent = np.nan

    # NOTE(review): unlike the other maps this one has no NaN entry and sends
    # "permissions" to 0 - confirm that this is intended.
    permissions_codes = {"permissions": 0}
    conditions_codes = {absent: 0, "conditions": 1}
    limitations_codes = {absent: 0, "limitations": 1}
    patent_codes = {absent: 0, "permissions": 1, "limitations": 2}

    # (columns, value -> code map) pairs, applied in this order.
    # A further map for a "GTLC_Permissive" column
    # (NaN -> 0, "permissive" -> 1, "not_permissive" -> 2) is disabled.
    recoding_plan = [
        (
            [
                "commercial-use",
                "distribution",
                "modifications",
                "private-use",
            ],
            permissions_codes,
        ),
        (
            [
                "disclose-source",
                "document-changes",
                "include-copyright",
                "include-copyright--source",
                "network-use-disclose",
                "same-license",
                "same-license--file",
                "same-license--library",
            ],
            conditions_codes,
        ),
        (
            [
                "liability",
                "trademark-use",
                "warranty",
            ],
            limitations_codes,
        ),
        (
            [
                "patent-use",
            ],
            patent_codes,
        ),
    ]

    for columns, codes in recoding_plan:
        license_data[columns] = license_data[columns].replace(codes)
    return license_data