import re
import json
from collections import defaultdict

from bs4 import BeautifulSoup
from striprtf.striprtf import rtf_to_text

PARA_BREAK = "para___break"
separator = "=" * 50
verbosity = 0

def extract_author_details(text, verbosity=0):
    """
    Extracts important author information from the license text.

    Parameters
    ----------
    text : str
        Raw License text.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    text : str
        License text with author details removed.
    author_details : list
        A list of important author details.
    """
    author_details_pattern = r"(@(author|license|copyright|package).*)"
    author_details = list()

    def _collect(match):
        # Store the matched author line and replace it with an empty string.
        author_details.append(match.group(1))
        return ""

    text = re.sub(author_details_pattern, _collect, text)
    if author_details and verbosity != 0:
        print(separator)
        print(separator)
        print("Following author details were extracted:")
        print(separator)
        print(author_details)
        print()
    return text, author_details
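# Illustrative example (not part of the original module): given a license
# header such as
#     "@author Jane Doe\nPermission is granted ...",
# extract_author_details() is expected to return the text with the "@author"
# line stripped out, together with ["@author Jane Doe"] as the extracted details.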

def php_cleaner(text):
    """
    Cleans the license file in PHP format.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with PHP script removed.
    """
    try:
        # Keep only the first /* ... */ comment block, which usually holds the license.
        return re.findall(r"/\*[\S\s]*?\*/", text)[0]
    except IndexError:
        return ""
    # return re.findall(r"(?<=<\?php\\n\\n\/\*\*\\n \*).*(?=\\n \*\/)", text)[0]
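# Illustrative example (not part of the original module): for a PHP source file
# that starts with
#     "<?php\n/* MIT License ... */\necho 'hi';",
# php_cleaner() is expected to return only the "/* MIT License ... */" block.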

def html_cleaner(text):
    """
    Cleans the license file in HTML format.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with HTML script removed.
    """
    soup = BeautifulSoup(text, "html.parser")
    if soup.body is None:
        return ""
    text = soup.body.get_text()
    if not text:
        return ""
    return text

def json_cleaner(text_dict):
    """
    Cleans the license file in JSON format.

    Parameters
    ----------
    text_dict : dict
        Parsed JSON content of the License file.

    Returns
    -------
    str
        Cleaned License text with JSON format normalized to text.
    """
    out = ""
    for key in text_dict.keys():
        # Only the "description" and "license" fields are kept.
        if key in ("description", "license"):
            out += key
            out += ": "
            out += str(text_dict[key])
            out += ", "
    return out

def rtf_cleaner(text):
    """
    Cleans the license file in RTF format.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with RTF markup removed.
    """
    return rtf_to_text(text)

def url_cleaner(text):
    """
    Removes URLs from the License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with URLs removed.
    """
    return re.sub(r"\(?http\S+\)?", "", text)

def email_cleaner(text):
    """
    Removes emails from the License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with emails removed.
    """
    return re.sub(r"\S{3,}@\S{2,}\.\S+", "", text)

def var_cleaner(text):
    """
    Removes potential variable names from the License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with variable names removed.
    """
    text = re.sub(r"\$\w+", "", text)
    text = re.sub(r"{[{}()\w\s._,\[\]'\"]+}", "", text)
    # text = re.sub(r"[a-zA-Z\(\)_'\"]+\.[a-zA-Z_]+", "", text)
    return text

def character_cleaner(text):
    """
    Removes unnecessary special characters from the License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with some special characters removed.
    """
    text = url_cleaner(text)
    text = email_cleaner(text)
    text = var_cleaner(text)
    text = re.sub(r"\s*(;quot;|&)\s*", " ", text)
    text = re.sub(r"[\n]{2,}", ". ", text)
    text = re.sub(r"[:%#<>=*\-/·\s{}]+", " ", text)
    text = re.sub(r"[\. ]{2,}", ". ", text)
    return text
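# Illustrative example (not part of the original module): character_cleaner()
# chains the URL, e-mail and variable cleaners and then collapses leftover
# punctuation and whitespace, so a fragment like
#     "Licensed under   the  MIT License -- see https://opensource.org/licenses/MIT"
# is expected to come back as a single-spaced sentence without the URL.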

def isEnglish(s):
    """
    Checks whether the License text is in English or not.

    Parameters
    ----------
    s : str
        Raw License text.

    Returns
    -------
    bool
        True if complete License text is in English, False otherwise.
    """
    # Heuristic: treat the text as English if it can be encoded as plain ASCII.
    try:
        s.encode(encoding="utf-8").decode("ascii")
    except UnicodeDecodeError:
        return False
    else:
        return True
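# Illustrative example (not part of the original module):
#     isEnglish("MIT License")   -> True
#     isEnglish("Licencia MIT®") -> False (the "®" is not ASCII)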

def split_definitions_exceptions(text, remove_exceptions, verbosity=0):
    """
    Extracts definitions and exceptions from the License text.

    Parameters
    ----------
    text : str
        Raw License text.
    remove_exceptions : bool
        If True, paragraphs containing exceptions are dropped from the returned
        paragraphs.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    paras : list
        A list of paragraphs from License text with definitions and exceptions
        removed.
    definitions : str
        Definitions extracted from the License text.
    exceptions : list
        A list of paragraphs which contain exceptions.
    """
    definitions = ""
    if "Definitions" in text:
        try:
            def_pattern = r"([S|s]ection )?[0-9] ?[\.|-|–]? ?([A|a]dditional )?[D|d]efinitions"
            after_def_pattern = r"\s+(Section )?[0-9]\.? [\.|-|–]? ?[A-Z][a-z]+"
            def_pos = re.search(def_pattern, text).span()
            other_start_pos = re.search(after_def_pattern, text[def_pos[1]:]).span()[0]
            definitions = text[def_pos[0]: def_pos[1] + other_start_pos].strip() + "\n\n"
            text = text[:def_pos[0]] + text[def_pos[1] + other_start_pos:]
        except AttributeError:
            # One of the patterns did not match; leave the text unchanged.
            pass
    paras, more_defs = extract_relevant_paras(
        split_paras(text, verbosity=verbosity),
        verbosity=verbosity
    )
    definitions += more_defs.strip()
    paras, exceptions = get_exceptions(paras, remove_exceptions, verbosity=verbosity)
    return paras, definitions, exceptions
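# Illustrative example (not part of the original module): for a GPL-style text
# containing a numbered "1. Definitions" section, that section is expected to
# end up in `definitions`, any paragraph mentioning an exception in
# `exceptions`, and the remaining paragraphs in `paras`.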

def discard_text_after_end_tnc(text):
    """
    Discards text after "END OF TERMS AND CONDITIONS".

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        License text with irrelevant information after
        "END OF TERMS AND CONDITIONS" removed.
    """
    return text.split("END OF TERMS AND CONDITIONS")[0]

def clear_preamble(text):
    """
    Removes the Preamble from the License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    text : str
        License text with Preamble removed.
    """
    preamble_pattern = "Preamble"
    dist_and_mod_pattern = r"distribution\s+and\s+modification\s+follow\.?"
    if preamble_pattern in text:
        preamble_split = text.split(preamble_pattern)
        if len(preamble_split) != 2:
            return text
        try:
            after_preamble_end = re.split(dist_and_mod_pattern, preamble_split[1])[1]
            # TODO Why do we need this condition?
            if len(preamble_split[0]) > 100:
                text = preamble_split[0] + after_preamble_end.strip()
        except IndexError:
            # The preamble does not end with the expected phrase; keep the text as is.
            pass
    return text

def gnu_cleaner(text):
    """
    Cleans GNU-style license text by discarding the Preamble and any text after
    the end of terms and conditions.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    preamble_cleared_text : str
        License text with irrelevant information in the Preamble and text after
        the end of terms and conditions removed.
    """
    before_end_tnc = discard_text_after_end_tnc(text)
    preamble_cleared_text = clear_preamble(before_end_tnc)
    return preamble_cleared_text

def preprocess_text(text):
    """
    Preprocesses License text considering different License types.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    text : str
        Cleaned License text.
    """
    # if most_likely_license_type in [
    #     "GPL-3.0-only",
    #     "AGPL-3.0-only",
    #     "GPL-2.0-only",
    #     "LGPL-3.0-only",
    #     "LGPL-2.1-only",
    # ]:
    # # We need to take care of these cases too:
    # # https://choosealicense.com/licenses/ofl-1.1/
    # # https://choosealicense.com/licenses/lodbl-1.0/
    # # https://choosealicense.com/licenses/odbl-1.0/
    # # https://choosealicense.com/licenses/lms-rl/
    # # https://choosealicense.com/licenses/lms-pl/
    # # https://choosealicense.com/licenses/lmpl-2.0/
    # # https://choosealicense.com/licenses/lppl-1.3c/
    # # https://choosealicense.com/licenses/eupl-1.2/
    # # https://choosealicense.com/licenses/eupl-1.1/
    # # https://choosealicense.com/licenses/epl-2.0/
    # # https://choosealicense.com/licenses/epl-1.0/
    # # https://choosealicense.com/licenses/ecl-2.0/
    # # https://choosealicense.com/licenses/cecill-2.1/
    # # https://choosealicense.com/licenses/cc-by-sa-4.0/
    # # https://choosealicense.com/licenses/cc-by-4.0/
    # # https://choosealicense.com/licenses/artistic-2.0/
    # # https://choosealicense.com/licenses/apache-2.0/
    # TODO This condition will not work, fix it:
    if "GNU" in text or "Apache" in text:
        text = gnu_cleaner(text)
    return text

def clean_if_else(text):
    """
    Removes C-style preprocessor #if ... #endif blocks from the License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with if-else conditions removed.
    """
    return re.sub(r"#\bif[\s\S]+?#endif\s*", "", text).strip()

def clean_comments(text):
    """
    Removes specific comment formats from the License text.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text with comments removed.
    """
    return re.sub(r"[\`'\"]{3,}[\s\S]*?[\`'\"]{3,}", "", text).strip()

def script_cleaner(text):
    """
    Cleans the script text from License text to extract the main content.

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        Cleaned License text without scripts.
    """
    if "<?php" in text:
        text = php_cleaner(text)
    elif "</html>" in text:
        text = html_cleaner(text)
    # Check for RTF before JSON: an RTF document also starts with "{" and may
    # end with "}", which would otherwise send it to json.loads().
    elif "\\rtf" in text:
        text = rtf_cleaner(text)
    elif text[0] == "{" and text[-1] == "}":
        text = json_cleaner(json.loads(text))
    if not text:
        return ""
    text = clean_if_else(text)
    text = clean_comments(text)
    return text

def split_paras(text, verbosity=0):
    """
    Splits the text into paragraphs.

    Parameters
    ----------
    text : str
        Raw License text.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    paras : list
        A list of split paragraphs.
    """
    text = re.sub(r"\n{4,}", "\n" * 4, text)
    if len(re.findall("\n\n\n\n", text)) >= 2:
        paras = text.split("\n\n\n\n")
        paras = [re.sub(r"\n{1,3}", " ", para) for para in paras]
    elif len(re.findall("\n\n", text)) >= 2:
        paras = text.split("\n\n")
        paras = [re.sub(r"\n", " ", para) for para in paras]
    elif len(re.findall("\n", text)) >= 2:
        paras = text.split("\n")
    else:
        paras = [text]
    if verbosity != 0:
        print(separator)
        print(separator)
        print("These are the split paras in the text:")
        for para in paras:
            if not para.strip():
                continue
            print(separator)
            print(para)
        print()
    return paras
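# Illustrative example (not part of the original module):
#     split_paras("Para one line A\nline B\n\nPara two\n\nPara three")
# is expected to return ["Para one line A line B", "Para two", "Para three"],
# because there are at least two double-newline separators in the text.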

def extract_relevant_paras(paras, verbosity=0):
    """
    Extracts relevant paragraphs from the list of all paragraphs.

    Parameters
    ----------
    paras : list
        A list of split paragraphs.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    cleaned_paras : list
        A list of relevant paragraphs.
    definitions : str
        Definition text as extracted by the "clean_definitions_pattern", which
        is to be appended to other definitions in the License text if any.
    """
    cleaned_paras = list()
    definitions = ""
    # TODO This might be interesting to look into:
    # https://choosealicense.com/licenses/eupl-1.2/
    clean_definitions_pattern = r"""\".{0,20}\".{0,40}(mean|include|refer)s?"""
    if verbosity != 0:
        print(separator)
        print(separator)
        print("Following paragraphs were considered unnecessary and removed:")
    for para in paras:
        if not para.strip():
            continue
        if re.search(clean_definitions_pattern, para):
            definitions += para + "\n\n"
            if verbosity != 0:
                print(separator)
                print(para)
        else:
            cleaned_paras.append(para)
    if verbosity != 0:
        print()
    definitions = definitions.strip()
    return cleaned_paras, definitions

def get_all_caps(text, verbosity=0):
    """
    Extracts all-caps content from the License text.

    Parameters
    ----------
    text : str
        Raw License text.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    text : str
        License text with all caps sentences removed.
    all_caps : list
        A list of all caps sentences from the License text.
    """
    all_caps_pattern = r"([^a-z\n]{50,})"
    all_caps = list()

    def _collect(match):
        # Store the all-caps run and replace it with an empty string.
        all_caps.append(match.group(1))
        return ""

    text = re.sub(all_caps_pattern, _collect, text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    if all_caps and verbosity != 0:
        print(separator)
        print(separator)
        print("Following all caps were removed from the text:")
        print(all_caps)
        print()
    return text, all_caps
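# Illustrative example (not part of the original module): a disclaimer such as
#     "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND..."
# contains a run of 50+ characters without lowercase letters, so it is expected
# to be moved into the returned `all_caps` list and removed from the text.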

def get_exceptions(paras, remove_exceptions, verbosity=0):
    """
    Extracts a list of exceptions from the License text.

    Parameters
    ----------
    paras : list
        A list of paragraphs from the License text.
    remove_exceptions : bool
        If True, paragraphs containing exceptions are not included in the
        returned list of paragraphs.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    non_exception_paras : list
        A list of all paragraphs not containing exceptions from the License text.
    exceptions : list
        A list of all paragraphs containing exceptions from the License text.
    """
    non_exception_paras = list()
    exceptions = list()
    for para in paras:
        if re.search("exception", para.lower()):
            exceptions.append(para)
            if not remove_exceptions:
                non_exception_paras.append(para)
        else:
            non_exception_paras.append(para)
    if exceptions and verbosity != 0:
        print(separator)
        print(separator)
        print("Following exceptions were found in the text:")
        for exception in exceptions:
            print(separator)
            print(exception)
        print()
    return non_exception_paras, exceptions
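# Illustrative example (not part of the original module): with
#     paras = ["Permission is granted...", "As a special exception, ..."]
# get_exceptions(paras, remove_exceptions=True) is expected to return only the
# first paragraph, together with the exception paragraph in a separate list.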

def get_MIT_content(text):
    """
    Returns the content of MIT-like licenses segregated into categories such as
    Copyright, main content, etc.

    Parameters
    ----------
    text : str
        Cleaned MIT License text.

    Returns
    -------
    dict
        A dictionary of content from the MIT license. Keys are the type of
        content and values are the License contents from License text.
    """
    paras = split_paras(text)
    mit_content = defaultdict(list)
    for para in paras:
        para = para.strip()
        if len(para) < 1:
            continue
        if len(para.split()) <= 10 and ("Licens" in para or "licens" in para) and "Copyright" not in para:
            mit_content["header"].append(para)
        elif "Copyright" in para:
            if "is hereby granted" in para:
                mit_content["copyright+content"].append(para)
            else:
                mit_content["copyright"].append(para)
        elif "Permission is hereby granted" in para:
            mit_content["content"].append(para)
        elif "The above copyright notice" in para or len(para.split()) < 18:
            mit_content["sentence"].append(para)
        elif get_all_caps(para)[1]:
            mit_content["all_cap"].append(para)
        else:
            mit_content["content"].append(para)
    for key, value in mit_content.items():
        mit_content[key] = "\n\n".join(value)
    return mit_content
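# Illustrative example (not part of the original module): for the standard MIT
# text, the "Copyright (c) <year> <owner>" paragraph is expected under the
# "copyright" key, the "Permission is hereby granted..." paragraph under
# "content", and the all-caps warranty disclaimer under "all_cap".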

def get_most_likely_license_type(text):
    """
    Returns the most likely license type based on Doc2Vec scores
    (similarity > 0.9).

    Parameters
    ----------
    text : str
        Raw License text.

    Returns
    -------
    str
        The type of the most likely license. "Not Found" if no license score is
        above 0.9.
    """
    try:
        from src.doc2vec import inference
    except ImportError:
        from doc2vec import inference
    top1_result = inference(text).loc[0, :]
    if top1_result["Scores"] > 0.9:
        return top1_result["License"]
    else:
        return "Not Found"

def clean_license_text(text, remove_exceptions=False, verbosity=0):
    """
    Cleans License text.

    Parameters
    ----------
    text : str
        Raw License text.
    remove_exceptions : bool, optional
        If True, paragraphs containing exceptions are removed from the cleaned
        text. The default is False.
    verbosity : int, optional
        The level of print statements on the output console. The default is 0.

    Returns
    -------
    text : str
        Cleaned License text.
    definitions : str
        Definitions extracted from the License text.
    """
    if len(text) == 0:
        # Keep the return type consistent: (text, definitions).
        return "", ""
    most_likely_license_type = get_most_likely_license_type(text)
    text, author_details = extract_author_details(text, verbosity=verbosity)
    text = script_cleaner(text)
    text = preprocess_text(text)
    paras, definitions, exceptions = split_definitions_exceptions(
        text, remove_exceptions, verbosity=verbosity
    )
    text = PARA_BREAK.join(paras)
    text = character_cleaner(text)
    text = re.sub(PARA_BREAK, "\n\n", text)
    text = text.strip()
    # TODO Need to update this too:
    if not isEnglish(text):
        if not isEnglish(" ".join(text.split()[-5:-1])):
            return "", ""
    if "MIT" in most_likely_license_type:
        mit_content = get_MIT_content(text)
        if verbosity != 0:
            print("This is likely an MIT License!")
            print(mit_content)
    return text, definitions
""" | |
Notes: | |
1. Try splitting each para by "\n", if len == 1 and len(para) < 100 (or something) | |
-> Merge with the next para | |
Ex. "8. Termination." | |
""" |