import re
import json
from bs4 import BeautifulSoup
from striprtf.striprtf import rtf_to_text
from collections import defaultdict
PARA_BREAK = "para___break"
seperator = "=" * 50
verbosity = 0
def extract_author_details(text, verbosity=0):
"""
Extracts important author information from the license text.
Parameters
----------
text : str
Raw License text.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
text : str
License text with author details removed.
author_details : list
A list of important author details.
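Examples
--------
Illustrative usage with a made-up input string:
>>> extract_author_details("@copyright 2020 Example Corp. MIT License.")
('', ['@copyright 2020 Example Corp. MIT License.'])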
"""
author_details_pattern = r"(@(author|license|copyright|package).*)"
author_details = list()
# list.append returns None, so map each match to "" to keep re.sub's replacement a string
text = re.sub(author_details_pattern, lambda m: author_details.append(m.group(1)) or "", text)
if author_details and verbosity != 0:
print(seperator)
print(seperator)
print("Following author details were extracted:")
print(seperator)
print(author_details)
print()
return text, author_details
def php_cleaner(text):
"""
Cleans the license file in PHP format.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with PHP script removed.
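Examples
--------
Illustrative usage with a made-up PHP snippet:
>>> php_cleaner("<?php /* MIT License. */ echo 'hi'; ?>")
'/* MIT License. */'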
"""
try:
return re.findall(r"/\*[\S\s]*?\*/", text)[0]
except IndexError:
return ""
# return re.findall(r"(?<=<\?php\\n\\n\/\*\*\\n \*).*(?=\\n \*\/)", text)[0]
def html_cleaner(text):
"""
Cleans the license file in HTML format.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with HTML script removed.
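Examples
--------
Illustrative usage with a made-up HTML snippet:
>>> html_cleaner("<html><body><p>BSD License</p></body></html>")
'BSD License'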
"""
soup = BeautifulSoup(text, "html.parser")
# Guard against documents without a <body>, which would otherwise raise AttributeError
if soup.body is None:
return ""
text = soup.body.text
if not text:
return ""
return text
def json_cleaner(text_dict):
"""
Cleans the license file in JSON format.
Parameters
----------
text_dict : dict
Parsed License file content as a dictionary (e.g. from json.loads).
Returns
-------
str
Cleaned License text with JSON format normalized to text.
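Examples
--------
Illustrative usage with a made-up package dictionary:
>>> json_cleaner({"name": "demo", "license": "MIT", "description": "A demo package"})
'license: MIT, description: A demo package, '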
"""
out = ""
for key in text_dict.keys():
if key in ("description", "license"):
out += key
out += ": "
out += str(text_dict[key])
out += ", "
return out
def rtf_cleaner(text):
"""
Cleans the license file in RTF format.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with RTF script removed.
"""
return rtf_to_text(text)
def url_cleaner(text):
"""
Removes URLs from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with URLs removed.
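Examples
--------
Illustrative usage with a made-up input (the removed URL leaves a doubled space behind):
>>> url_cleaner("See https://example.com/license for details.")
'See  for details.'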
"""
return re.sub(r"\(?http\S+\)?", "", text)
def email_cleaner(text):
"""
Removes emails from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with emails removed.
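Examples
--------
Illustrative usage with a made-up input (the removed address leaves a doubled space behind):
>>> email_cleaner("Maintained by dev.team@example.org until 2024.")
'Maintained by  until 2024.'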
"""
return re.sub(r"\S{3,}@\S{2,}\.\S+", "", text)
def var_cleaner(text):
"""
Removes potential variable names from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with variable names removed.
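Examples
--------
Illustrative usage with a made-up input (the removed variable leaves a doubled space behind):
>>> var_cleaner("Released under $license_name terms.")
'Released under  terms.'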
"""
text = re.sub(r"\$\w+", "", text)
text = re.sub(r"{[{}()\w\s._,\[\]'\"]+}", "", text)
# text = re.sub(r"[a-zA-Z\(\)_'\"]+\.[a-zA-Z_]+", "", text)
return text
def character_cleaner(text):
"""
Removes unnecessary special characters from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with some special characters removed.
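Examples
--------
Illustrative usage with a made-up input string:
>>> character_cleaner("Use of *this* software   is free.")
'Use of this software is free.'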
"""
text = url_cleaner(text)
text = email_cleaner(text)
text = var_cleaner(text)
text = re.sub("\s*(;quot;|&amp)\s*", " ", text)
text = re.sub("[\n]{2,}", ". ", text)
text = re.sub("[:%#<>=*\-/·\s{}]+", " ", text)
text = re.sub("[\. ]{2,}", ". ", text)
return text
def isEnglish(s):
"""
Checks whether the License text is likely to be English by testing whether it contains only ASCII characters.
Parameters
----------
s : str
Raw License text.
Returns
-------
bool
True if the complete License text is ASCII-only (treated as English), False otherwise.
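Examples
--------
Illustrative usage:
>>> isEnglish("MIT License")
True
>>> isEnglish("Licence publique générale GNU")
False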
"""
try:
s.encode(encoding="utf-8").decode("ascii")
except UnicodeDecodeError:
return False
else:
return True
def split_definitions_exceptions(text, remove_exceptions, verbosity=0):
"""
Extracts definitions and exceptions from the License text.
Parameters
----------
text : str
Raw License text.
remove_exceptions : bool
If True, paragraphs containing exceptions are dropped from the returned paragraphs.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
paras : list
A list of paragraphs from the License text with definitions removed
(and exceptions also removed when remove_exceptions is True).
definitions : str
Definitions extracted from the License text.
exceptions : list
A list of paragraphs which contain exceptions.
"""
definitions = ""
if "Definitions" in text:
try:
def_pattern = r"([S|s]ection )?[0-9] ?[\.|-|–]? ?([A|a]dditional )?[D|d]efinitions"
after_def_pattern = r"\s+(Section )?[0-9]\.? [\.|-|–]? ?[A-Z][a-z]+"
def_pos = re.search(def_pattern, text).span()
other_start_pos = re.search(after_def_pattern, text[def_pos[1]:]).span()[0]
definitions = text[def_pos[0]: def_pos[1] + other_start_pos].strip() + "\n\n"
text = text[:def_pos[0]] + text[def_pos[1] + other_start_pos:]
except AttributeError:
# re.search found no recognisable definitions-section layout; keep the text unchanged
pass
paras, more_defs = extract_relevant_paras(
split_paras(text, verbosity=verbosity),
verbosity=verbosity
)
definitions += more_defs.strip()
paras, exceptions = get_exeptions(paras, remove_exceptions, verbosity=verbosity)
return paras, definitions, exceptions
def discard_text_after_end_tnc(text):
"""
Discards text after "END OF TERMS AND CONDITIONS"
Parameters
----------
text : str
Raw License text.
Returns
-------
str
License text with irrelevant information after "END OF TERMS AND CONDITIONS" removed.
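Examples
--------
Illustrative usage with a made-up input (note the trailing space kept from the split):
>>> discard_text_after_end_tnc("Terms body. END OF TERMS AND CONDITIONS How to apply these terms.")
'Terms body. '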
"""
return text.split("END OF TERMS AND CONDITIONS")[0]
def clear_preamble(text):
"""
Cleans Preamble from the License text
Parameters
----------
text : str
Raw License text.
Returns
-------
text : str
License text with Preamble removed.
"""
preamble_pattern = "Preamble"
dist_and_mod_pattern = r"distribution\s+and\s+modification\s+follow\.?"
if preamble_pattern in text:
preamble_split = text.split(preamble_pattern)
if len(preamble_split) != 2:
return text
try:
after_preamble_end = re.split(dist_and_mod_pattern, preamble_split[1])[1]
# TODO Why do we need this condition?
if len(preamble_split[0]) > 100:
text = preamble_split[0] + after_preamble_end.strip()
except IndexError:
# The end-of-preamble marker was not found; keep the text unchanged
pass
return text
def gnu_cleaner(text):
"""
Cleans GNU text such as discarding Preamble and text after end of terms
and conditions.
Parameters
----------
text : str
Raw License text.
Returns
-------
preamble_cleared_text : str
License text with irrelevant information in Preamble and text after end
of terms and conditions removed.
"""
before_end_tnc = discard_text_after_end_tnc(text)
preamble_cleared_text = clear_preamble(before_end_tnc)
return preamble_cleared_text
def preprocess_text(text):
"""
Preprocesses License text considering different License types.
Parameters
----------
text : str
Raw License text.
Returns
-------
text : str
Cleaned License text.
"""
# if most_likely_license_type in [
# "GPL-3.0-only",
# "AGPL-3.0-only",
# "GPL-2.0-only",
# "LGPL-3.0-only",
# "LGPL-2.1-only",
# ]:
# # We need to take care of these cases too:
# # https://choosealicense.com/licenses/ofl-1.1/
# # https://choosealicense.com/licenses/lodbl-1.0/
# # https://choosealicense.com/licenses/odbl-1.0/
# # https://choosealicense.com/licenses/lms-rl/
# # https://choosealicense.com/licenses/lms-pl/
# # https://choosealicense.com/licenses/lmpl-2.0/
# # https://choosealicense.com/licenses/lppl-1.3c/
# # https://choosealicense.com/licenses/eupl-1.2/
# # https://choosealicense.com/licenses/eupl-1.1/
# # https://choosealicense.com/licenses/epl-2.0/
# # https://choosealicense.com/licenses/epl-1.0/
# # https://choosealicense.com/licenses/ecl-2.0/
# # https://choosealicense.com/licenses/cecill-2.1/
# # https://choosealicense.com/licenses/cc-by-sa-4.0/
# # https://choosealicense.com/licenses/cc-by-4.0/
# # https://choosealicense.com/licenses/artistic-2.0/
# # https://choosealicense.com/licenses/apache-2.0/
# TODO This condition will not work, fix it:
if "GNU" in text or "Apache" in text:
text = gnu_cleaner(text)
return text
def clean_if_else(text):
"""
Removes specific if-else conditions from the License text
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with if-else conditions removed.
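Examples
--------
Illustrative usage with a made-up input string:
>>> clean_if_else("MIT License. #if defined(_WIN32) int x = 1; #endif Permission is granted.")
'MIT License. Permission is granted.'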
"""
return re.sub(r"#\bif[\s\S]+?#endif\s*", "", text).strip()
def clean_comments(text):
"""
Cleans specific comment formats from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with such comments removed.
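Examples
--------
Illustrative usage with a made-up input (backticks stand in for the quote fences handled here):
>>> clean_comments("MIT License ``` leftover build note ``` Version 1.0")
'MIT License  Version 1.0'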
"""
return re.sub(r"[\`'\"]{3,}[\s\S]*?[\`'\"]{3,}", "", text).strip()
def script_cleaner(text):
"""
Cleans the script text from License text to extract the main content.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text without scripts.
"""
if "<?php" in text:
text = php_cleaner(text)
elif "</html>" in text:
text = html_cleaner(text)
elif text[0] == "{" and text[-1] == "}":
text = json_cleaner(json.loads(text))
elif "\\rtf" in text:
text = rtf_cleaner(text)
if not text:
return ""
text = clean_if_else(text)
text = clean_comments(text)
return text
def split_paras(text, verbosity=0):
"""
Splits the text into paragraphs.
Parameters
----------
text : str
Raw License text.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
paras : list
A list of split paragraphs.
"""
text = re.sub(r"\n{4,}", "\n"*4, text)
if len(re.findall("\n\n\n\n", text)) >= 2:
paras = text.split("\n\n\n\n")
paras = [re.sub(r"\n{1,3}", " ", para) for para in paras]
elif len(re.findall("\n\n", text)) >= 2:
paras = text.split("\n\n")
paras = [re.sub(r"\n", " ", para) for para in paras]
elif len(re.findall("\n", text)) >= 2:
paras = text.split("\n")
else:
paras = [text]
if verbosity != 0:
print(seperator)
print(seperator)
print("These are the split paras in the text:")
for para in paras:
if not para.strip():
continue
print(seperator)
print(para)
print()
return paras
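# Note on split_paras (illustrative, made-up input): paragraphs separated by
# blank lines are split apart and single newlines inside a paragraph become
# spaces, e.g. split_paras("Para one.\n\nPara two.\nStill para two.") returns
# ["Para one.", "Para two. Still para two."].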
def extract_relevant_paras(paras, verbosity=0):
"""
Extracts relevant paragraphs from the list of all paragraphs.
Parameters
----------
paras : list
A list of split paragraphs.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
cleaned_paras : list
A list of relevant paragraphs.
definitions : str
Definition text as extracted by the "clean_definitions_pattern", which
is to be appended to other definitions in the License text if any.
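Examples
--------
Illustrative usage with made-up paragraphs:
>>> extract_relevant_paras(['"Software" means the program distributed under this License.', 'You may copy the Software.'])
(['You may copy the Software.'], '"Software" means the program distributed under this License.')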
"""
cleaned_paras = list()
definitions = ""
# TODO This might be interesting to look into:
# https://choosealicense.com/licenses/eupl-1.2/
clean_definitions_pattern = r"""\".{0,20}\".{0,40}(mean|include|refer)s?"""
if verbosity != 0:
print(seperator)
print(seperator)
print("Following paragraphs were considered unnecessary and removed:")
for para in paras:
if not para.strip():
continue
if re.search(clean_definitions_pattern, para):
definitions += para + "\n\n"
if verbosity != 0:
print(seperator)
print(para)
else:
cleaned_paras.append(para)
if verbosity != 0:
print()
definitions = definitions.strip()
return cleaned_paras, definitions
def get_all_caps(text, verbosity=0):
"""
Extracts text with all caps content from the License text.
Parameters
----------
text : str
Raw License text.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
text : str
License text with all caps sentences removed.
all_caps : list
A list of all caps sentences from the License text.
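Examples
--------
Illustrative usage with a made-up warranty disclaimer:
>>> get_all_caps("THIS SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND.")
('', ['THIS SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND.'])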
"""
all_caps_pattern = r"([^a-z\n]{50,})"
all_caps = list()
# As in extract_author_details, the replacement callable must return a string, not None
text = re.sub(all_caps_pattern, lambda m: all_caps.append(m.group(1)) or "", text)
text = re.sub(r"\n{3,}", "\n\n", text)
if all_caps and verbosity != 0:
print(seperator)
print(seperator)
print("Following all caps were removed from the text:")
print(all_caps)
print()
return text, all_caps
def get_exeptions(paras, remove_exceptions, verbosity=0):
"""
Extracts a list of exceptions from the License text.
Parameters
----------
paras : list
A list of paragraphs from the License text.
remove_exceptions : bool
If True, paragraphs containing exceptions are excluded from non_exception_paras.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
non_exception_paras : list
A list of paragraphs from the License text; paragraphs containing
exceptions are excluded only when remove_exceptions is True.
exceptions : list
A list of all paragraphs containing exceptions from the License text.
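Examples
--------
Illustrative usage with made-up paragraphs:
>>> get_exeptions(["Standard terms apply.", "As a special exception, linking is allowed."], remove_exceptions=True)
(['Standard terms apply.'], ['As a special exception, linking is allowed.'])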
"""
non_exception_paras = list()
exceptions = list()
for para in paras:
if re.search("exception", para.lower()):
exceptions.append(para)
if not remove_exceptions:
non_exception_paras.append(para)
else:
non_exception_paras.append(para)
if exceptions and verbosity != 0:
print(seperator)
print(seperator)
print("Following exceptions were found in the text:")
for exception in exceptions:
print(seperator)
print(exception)
print()
return non_exception_paras, exceptions
def get_MIT_content(text):
"""
Returns the content of MIT-like licenses segregated into categories such as
Copyright, main content, etc.
Parameters
----------
text : str
Cleaned MIT License text.
Returns
-------
dictionary
A dictionary of content from the MIT license. Keys are the type of
content and values are the License contents from License text.
"""
paras = split_paras(text)
mit_content = defaultdict(list)
for para in paras:
para = para.strip()
if len(para) < 1:
continue
if len(para.split()) <= 10 and ("Licens" in para or "licens" in para) and "Copyright" not in para:
mit_content["header"].append(para)
elif "Copyright" in para:
if "is hereby granted" in para:
mit_content["copyright+content"].append(para)
else:
mit_content["copyright"].append(para)
elif "Permission is hereby granted" in para:
mit_content["content"].append(para)
elif "The above copyright notice" in para or len(para.split()) < 18:
mit_content["sentence"].append(para)
elif get_all_caps(para)[1]:
mit_content["all_cap"].append(para)
else:
mit_content["content"].append(para)
for key, value in mit_content.items():
mit_content[key] = "\n\n".join(value)
return mit_content
def get_most_likely_license_type(text):
"""
Returns the most likely license type based on Doc2Vec scores
(similarity > 0.9).
Parameters
----------
text : str
Raw License text.
Returns
-------
str
The type of the most likely license. "Not Found" if no license score is
above 0.9.
"""
try:
from src.doc2vec import inference
except ImportError:
from doc2vec import inference
top1_result = inference(text).loc[0, :]
if top1_result["Scores"] > 0.9:
return top1_result["License"]
else:
return "Not Found"
def clean_license_text(text, remove_exceptions=False, verbosity=0):
"""
Cleans License text.
Parameters
----------
text : str
Raw License text.
remove_exceptions : bool, optional
If True, paragraphs containing exceptions are removed from the cleaned
text. The default is False.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
text : str
Cleaned License text.
definitions : str
Definitions extracted from the License text.
"""
if len(text) == 0:
# Keep the (text, definitions) contract even for empty input
return "", ""
most_likely_license_type = get_most_likely_license_type(text)
text, author_details = extract_author_details(text, verbosity=verbosity)
text = script_cleaner(text)
text = preprocess_text(text)
paras, definitions, exceptions = split_definitions_exceptions(
text, remove_exceptions, verbosity=verbosity
)
text = PARA_BREAK.join(paras)
text = character_cleaner(text)
text = re.sub(PARA_BREAK, "\n\n", text)
text = text.strip()
# TODO Need to update this too:
if not isEnglish(text):
if not isEnglish(" ".join(text.split()[-5:-1])):
return "", ""
if "MIT" in most_likely_license_type:
mit_content = get_MIT_content(text)
if verbosity != 0:
print("This is likely an MIT License!")
print(mit_content)
return text, definitions
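# Typical usage (illustrative; requires the Doc2Vec model consulted by
# get_most_likely_license_type to be available):
# cleaned_text, definitions = clean_license_text(raw_text, remove_exceptions=True, verbosity=1)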
"""
Notes:
1. Try splitting each para by "\n", if len == 1 and len(para) < 100 (or something)
-> Merge with the next para
Ex. "8. Termination."
"""