import re
import json
from bs4 import BeautifulSoup
from striprtf.striprtf import rtf_to_text
from collections import defaultdict
PARA_BREAK = "para___break"
seperator = "=" * 50
verbosity = 0
def extract_author_details(text, verbosity=0):
"""
Extracts important author information from the license text.
Parameters
----------
text : str
Raw License text.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
text : str
License text with author details removed.
author_details : list
A list of important author details.
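Examples
--------
Illustrative usage with a made-up input string:
>>> extract_author_details("@copyright 2020 Example Corp. MIT License.")
('', ['@copyright 2020 Example Corp. MIT License.'])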
"""
author_details_pattern = r"(@(author|license|copyright|package).*)"
author_details = list()
# list.append returns None, so map each match to "" to keep re.sub's replacement a string
text = re.sub(author_details_pattern, lambda m: author_details.append(m.group(1)) or "", text)
if author_details and verbosity != 0:
print(seperator)
print(seperator)
print("Following author details were extracted:")
print(seperator)
print(author_details)
print()
return text, author_details
def php_cleaner(text):
"""
Cleans the license file in PHP format.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with PHP script removed.
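Examples
--------
Illustrative usage with a made-up PHP snippet:
>>> php_cleaner("<?php /* MIT License. */ echo 'hi'; ?>")
'/* MIT License. */'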
"""
try:
return re.findall(r"/\*[\S\s]*?\*/", text)[0]
except IndexError:
return ""
# return re.findall(r"(?<=<\?php\\n\\n\/\*\*\\n \*).*(?=\\n \*\/)", text)[0]
def html_cleaner(text):
"""
Cleans the license file in HTML format.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with HTML script removed.
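Examples
--------
Illustrative usage with a made-up HTML snippet:
>>> html_cleaner("<html><body><p>BSD License</p></body></html>")
'BSD License'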
"""
soup = BeautifulSoup(text, "html.parser")
# Guard against documents without a <body>, which would otherwise raise AttributeError
if soup.body is None:
return ""
text = soup.body.text
if not text:
return ""
return text
def json_cleaner(text_dict):
"""
Cleans the license file in JSON format.
Parameters
----------
text_dict : dict
Parsed License file content as a dictionary (e.g. from json.loads).
Returns
-------
str
Cleaned License text with JSON format normalized to text.
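Examples
--------
Illustrative usage with a made-up package dictionary:
>>> json_cleaner({"name": "demo", "license": "MIT", "description": "A demo package"})
'license: MIT, description: A demo package, '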
"""
out = ""
for key in text_dict.keys():
if key in ("description", "license"):
out += key
out += ": "
out += str(text_dict[key])
out += ", "
return out
def rtf_cleaner(text):
"""
Cleans the license file in RTF format.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with RTF script removed.
"""
return rtf_to_text(text)
def url_cleaner(text):
"""
Removes URLs from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with URLs removed.
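Examples
--------
Illustrative usage with a made-up input (the removed URL leaves a doubled space behind):
>>> url_cleaner("See https://example.com/license for details.")
'See  for details.'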
"""
return re.sub(r"\(?http\S+\)?", "", text)
def email_cleaner(text):
"""
Removes emails from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with emails removed.
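Examples
--------
Illustrative usage with a made-up input (the removed address leaves a doubled space behind):
>>> email_cleaner("Maintained by dev.team@example.org until 2024.")
'Maintained by  until 2024.'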
"""
return re.sub(r"\S{3,}@\S{2,}\.\S+", "", text)
def var_cleaner(text):
"""
Removes potential variable names from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with variable names removed.
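Examples
--------
Illustrative usage with a made-up input (the removed variable leaves a doubled space behind):
>>> var_cleaner("Released under $license_name terms.")
'Released under  terms.'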
"""
text = re.sub(r"\$\w+", "", text)
text = re.sub(r"{[{}()\w\s._,\[\]'\"]+}", "", text)
# text = re.sub(r"[a-zA-Z\(\)_'\"]+\.[a-zA-Z_]+", "", text)
return text
def character_cleaner(text):
"""
Removes unnecessary special characters from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with some special characters removed.
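Examples
--------
Illustrative usage with a made-up input string:
>>> character_cleaner("Use of *this* software   is free.")
'Use of this software is free.'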
"""
text = url_cleaner(text)
text = email_cleaner(text)
text = var_cleaner(text)
text = re.sub("\s*(;quot;|&amp)\s*", " ", text)
text = re.sub("[\n]{2,}", ". ", text)
text = re.sub("[:%#<>=*\-/·\s{}]+", " ", text)
text = re.sub("[\. ]{2,}", ". ", text)
return text
def isEnglish(s):
"""
Checks whether the License text is likely to be English by testing whether it contains only ASCII characters.
Parameters
----------
s : str
Raw License text.
Returns
-------
bool
True if the complete License text is ASCII-only (treated as English), False otherwise.
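Examples
--------
Illustrative usage:
>>> isEnglish("MIT License")
True
>>> isEnglish("Licence publique générale GNU")
False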
"""
try:
s.encode(encoding="utf-8").decode("ascii")
except UnicodeDecodeError:
return False
else:
return True
def split_definitions_exceptions(text, remove_exceptions, verbosity=0):
"""
Extracts definitions and exceptions from the License text.
Parameters
----------
text : str
Raw License text.
remove_exceptions : bool
If True, paragraphs containing exceptions are dropped from the returned paragraphs.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
paras : list
A list of paragraphs from the License text with definitions removed
(and exceptions also removed when remove_exceptions is True).
definitions : str
Definitions extracted from the License text.
exceptions : list
A list of paragraphs which contain exceptions.
"""
definitions = ""
if "Definitions" in text:
try:
def_pattern = r"([S|s]ection )?[0-9] ?[\.|-|–]? ?([A|a]dditional )?[D|d]efinitions"
after_def_pattern = r"\s+(Section )?[0-9]\.? [\.|-|–]? ?[A-Z][a-z]+"
def_pos = re.search(def_pattern, text).span()
other_start_pos = re.search(after_def_pattern, text[def_pos[1]:]).span()[0]
definitions = text[def_pos[0]: def_pos[1] + other_start_pos].strip() + "\n\n"
text = text[:def_pos[0]] + text[def_pos[1] + other_start_pos:]
except AttributeError:
# re.search found no recognisable definitions-section layout; keep the text unchanged
pass
paras, more_defs = extract_relevant_paras(
split_paras(text, verbosity=verbosity),
verbosity=verbosity
)
definitions += more_defs.strip()
paras, exceptions = get_exeptions(paras, remove_exceptions, verbosity=verbosity)
return paras, definitions, exceptions
def discard_text_after_end_tnc(text):
"""
Discards text after "END OF TERMS AND CONDITIONS"
Parameters
----------
text : str
Raw License text.
Returns
-------
str
License text with irrelevant information after "END OF TERMS AND CONDITIONS" removed.
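Examples
--------
Illustrative usage with a made-up input (note the trailing space kept from the split):
>>> discard_text_after_end_tnc("Terms body. END OF TERMS AND CONDITIONS How to apply these terms.")
'Terms body. '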
"""
return text.split("END OF TERMS AND CONDITIONS")[0]
def clear_preamble(text):
"""
Cleans Preamble from the License text
Parameters
----------
text : str
Raw License text.
Returns
-------
text : str
License text with Preamble removed.
"""
preamble_pattern = "Preamble"
dist_and_mod_pattern = r"distribution\s+and\s+modification\s+follow\.?"
if preamble_pattern in text:
preamble_split = text.split(preamble_pattern)
if len(preamble_split) != 2:
return text
try:
after_preamble_end = re.split(dist_and_mod_pattern, preamble_split[1])[1]
# TODO Why do we need this condition?
if len(preamble_split[0]) > 100:
text = preamble_split[0] + after_preamble_end.strip()
except IndexError:
# The end-of-preamble marker was not found; keep the text unchanged
pass
return text
def gnu_cleaner(text):
"""
Cleans GNU text such as discarding Preamble and text after end of terms
and conditions.
Parameters
----------
text : str
Raw License text.
Returns
-------
preamble_cleared_text : str
License text with irrelevant information in Preamble and text after end
of terms and conditions removed.
"""
before_end_tnc = discard_text_after_end_tnc(text)
preamble_cleared_text = clear_preamble(before_end_tnc)
return preamble_cleared_text
def preprocess_text(text):
"""
Preprocesses License text considering different License types.
Parameters
----------
text : str
Raw License text.
Returns
-------
text : str
Cleaned License text.
"""
# if most_likely_license_type in [
# "GPL-3.0-only",
# "AGPL-3.0-only",
# "GPL-2.0-only",
# "LGPL-3.0-only",
# "LGPL-2.1-only",
# ]:
# # We need to take care of these cases too:
# # https://choosealicense.com/licenses/ofl-1.1/
# # https://choosealicense.com/licenses/lodbl-1.0/
# # https://choosealicense.com/licenses/odbl-1.0/
# # https://choosealicense.com/licenses/lms-rl/
# # https://choosealicense.com/licenses/lms-pl/
# # https://choosealicense.com/licenses/lmpl-2.0/
# # https://choosealicense.com/licenses/lppl-1.3c/
# # https://choosealicense.com/licenses/eupl-1.2/
# # https://choosealicense.com/licenses/eupl-1.1/
# # https://choosealicense.com/licenses/epl-2.0/
# # https://choosealicense.com/licenses/epl-1.0/
# # https://choosealicense.com/licenses/ecl-2.0/
# # https://choosealicense.com/licenses/cecill-2.1/
# # https://choosealicense.com/licenses/cc-by-sa-4.0/
# # https://choosealicense.com/licenses/cc-by-4.0/
# # https://choosealicense.com/licenses/artistic-2.0/
# # https://choosealicense.com/licenses/apache-2.0/
# TODO This condition will not work, fix it:
if "GNU" in text or "Apache" in text:
text = gnu_cleaner(text)
return text
def clean_if_else(text):
"""
Removes specific if-else conditions from the License text
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with if-else conditions removed.
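Examples
--------
Illustrative usage with a made-up input string:
>>> clean_if_else("MIT License. #if defined(_WIN32) int x = 1; #endif Permission is granted.")
'MIT License. Permission is granted.'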
"""
return re.sub(r"#\bif[\s\S]+?#endif\s*", "", text).strip()
def clean_comments(text):
"""
Cleans specific comment formats from the License text.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text with such comments removed.
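Examples
--------
Illustrative usage with a made-up input (backticks stand in for the quote fences handled here):
>>> clean_comments("MIT License ``` leftover build note ``` Version 1.0")
'MIT License  Version 1.0'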
"""
return re.sub(r"[\`'\"]{3,}[\s\S]*?[\`'\"]{3,}", "", text).strip()
def script_cleaner(text):
"""
Cleans the script text from License text to extract the main content.
Parameters
----------
text : str
Raw License text.
Returns
-------
str
Cleaned License text without scripts.
"""
if "<?php" in text:
text = php_cleaner(text)
elif "</html>" in text:
text = html_cleaner(text)
elif text[0] == "{" and text[-1] == "}":
text = json_cleaner(json.loads(text))
elif "\\rtf" in text:
text = rtf_cleaner(text)
if not text:
return ""
text = clean_if_else(text)
text = clean_comments(text)
return text
def split_paras(text, verbosity=0):
"""
Splits the text into paragraphs.
Parameters
----------
text : str
Raw License text.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
paras : list
A list of split paragraphs.
"""
text = re.sub(r"\n{4,}", "\n"*4, text)
if len(re.findall("\n\n\n\n", text)) >= 2:
paras = text.split("\n\n\n\n")
paras = [re.sub(r"\n{1,3}", " ", para) for para in paras]
elif len(re.findall("\n\n", text)) >= 2:
paras = text.split("\n\n")
paras = [re.sub(r"\n", " ", para) for para in paras]
elif len(re.findall("\n", text)) >= 2:
paras = text.split("\n")
else:
paras = [text]
if verbosity != 0:
print(seperator)
print(seperator)
print("These are the split paras in the text:")
for para in paras:
if not para.strip():
continue
print(seperator)
print(para)
print()
return paras
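# Note on split_paras (illustrative, made-up input): paragraphs separated by
# blank lines are split apart and single newlines inside a paragraph become
# spaces, e.g. split_paras("Para one.\n\nPara two.\nStill para two.") returns
# ["Para one.", "Para two. Still para two."].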
def extract_relevant_paras(paras, verbosity=0):
"""
Extracts relevant paragraphs from the list of all paragraphs.
Parameters
----------
paras : list
A list of split paragraphs.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
cleaned_paras : list
A list of relevant paragraphs.
definitions : str
Definition text as extracted by the "clean_definitions_pattern", which
is to be appended to other definitions in the License text if any.
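Examples
--------
Illustrative usage with made-up paragraphs:
>>> extract_relevant_paras(['"Software" means the program distributed under this License.', 'You may copy the Software.'])
(['You may copy the Software.'], '"Software" means the program distributed under this License.')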
"""
cleaned_paras = list()
definitions = ""
# TODO This might be interesting to look into:
# https://choosealicense.com/licenses/eupl-1.2/
clean_definitions_pattern = r"""\".{0,20}\".{0,40}(mean|include|refer)s?"""
if verbosity != 0:
print(seperator)
print(seperator)
print("Following paragraphs were considered unnecessary and removed:")
for para in paras:
if not para.strip():
continue
if re.search(clean_definitions_pattern, para):
definitions += para + "\n\n"
if verbosity != 0:
print(seperator)
print(para)
else:
cleaned_paras.append(para)
if verbosity != 0:
print()
definitions = definitions.strip()
return cleaned_paras, definitions
def get_all_caps(text, verbosity=0):
"""
Extracts text with all caps content from the License text.
Parameters
----------
text : str
Raw License text.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
text : str
License text with all caps sentences removed.
all_caps : list
A list of all caps sentences from the License text.
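Examples
--------
Illustrative usage with a made-up warranty disclaimer:
>>> get_all_caps("THIS SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND.")
('', ['THIS SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND.'])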
"""
all_caps_pattern = r"([^a-z\n]{50,})"
all_caps = list()
# As in extract_author_details, the replacement callable must return a string, not None
text = re.sub(all_caps_pattern, lambda m: all_caps.append(m.group(1)) or "", text)
text = re.sub(r"\n{3,}", "\n\n", text)
if all_caps and verbosity != 0:
print(seperator)
print(seperator)
print("Following all caps were removed from the text:")
print(all_caps)
print()
return text, all_caps
def get_exeptions(paras, remove_exceptions, verbosity=0):
"""
Extracts a list of exceptions from the License text.
Parameters
----------
paras : list
A list of paragraphs from the License text.
remove_exceptions : bool
If True, paragraphs containing exceptions are excluded from non_exception_paras.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
non_exception_paras : list
A list of paragraphs from the License text; paragraphs containing
exceptions are excluded only when remove_exceptions is True.
exceptions : list
A list of all paragraphs containing exceptions from the License text.
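Examples
--------
Illustrative usage with made-up paragraphs:
>>> get_exeptions(["Standard terms apply.", "As a special exception, linking is allowed."], remove_exceptions=True)
(['Standard terms apply.'], ['As a special exception, linking is allowed.'])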
"""
non_exception_paras = list()
exceptions = list()
for para in paras:
if re.search("exception", para.lower()):
exceptions.append(para)
if not remove_exceptions:
non_exception_paras.append(para)
else:
non_exception_paras.append(para)
if exceptions and verbosity != 0:
print(seperator)
print(seperator)
print("Following exceptions were found in the text:")
for exception in exceptions:
print(seperator)
print(exception)
print()
return non_exception_paras, exceptions
def get_MIT_content(text):
"""
Returns the content of MIT-like licenses segregated into categories such as
Copyright, main content, etc.
Parameters
----------
text : str
Cleaned MIT License text.
Returns
-------
dictionary
A dictionary of content from the MIT license. Keys are the type of
content and values are the License contents from License text.
"""
paras = split_paras(text)
mit_content = defaultdict(list)
for para in paras:
para = para.strip()
if len(para) < 1:
continue
if len(para.split()) <= 10 and ("Licens" in para or "licens" in para) and "Copyright" not in para:
mit_content["header"].append(para)
elif "Copyright" in para:
if "is hereby granted" in para:
mit_content["copyright+content"].append(para)
else:
mit_content["copyright"].append(para)
elif "Permission is hereby granted" in para:
mit_content["content"].append(para)
elif "The above copyright notice" in para or len(para.split()) < 18:
mit_content["sentence"].append(para)
elif get_all_caps(para)[1]:
mit_content["all_cap"].append(para)
else:
mit_content["content"].append(para)
for key, value in mit_content.items():
mit_content[key] = "\n\n".join(value)
return mit_content
def get_most_likely_license_type(text):
"""
Returns the most likely license type based on Doc2Vec scores
(similarity > 0.9).
Parameters
----------
text : str
Raw License text.
Returns
-------
str
The type of the most likely license. "Not Found" if no license score is
above 0.9.
"""
try:
from src.doc2vec import inference
except ImportError:
from doc2vec import inference
top1_result = inference(text).loc[0, :]
if top1_result["Scores"] > 0.9:
return top1_result["License"]
else:
return "Not Found"
def clean_license_text(text, remove_exceptions=False, verbosity=0):
"""
Cleans License text.
Parameters
----------
text : str
Raw License text.
remove_exceptions : bool, optional
If True, paragraphs containing exceptions are removed from the cleaned
text. The default is False.
verbosity : int, optional
The level of print statements on the output console. The default is 0.
Returns
-------
text : str
Cleaned License text.
definitions : str
Definitions extracted from the License text.
"""
if len(text) == 0:
# Keep the (text, definitions) contract even for empty input
return "", ""
most_likely_license_type = get_most_likely_license_type(text)
text, author_details = extract_author_details(text, verbosity=verbosity)
text = script_cleaner(text)
text = preprocess_text(text)
paras, definitions, exceptions = split_definitions_exceptions(
text, remove_exceptions, verbosity=verbosity
)
text = PARA_BREAK.join(paras)
text = character_cleaner(text)
text = re.sub(PARA_BREAK, "\n\n", text)
text = text.strip()
# TODO Need to update this too:
if not isEnglish(text):
if not isEnglish(" ".join(text.split()[-5:-1])):
return "", ""
if "MIT" in most_likely_license_type:
mit_content = get_MIT_content(text)
if verbosity != 0:
print("This is likely an MIT License!")
print(mit_content)
return text, definitions
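# Typical usage (illustrative; requires the Doc2Vec model consulted by
# get_most_likely_license_type to be available):
# cleaned_text, definitions = clean_license_text(raw_text, remove_exceptions=True, verbosity=1)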
"""
Notes:
1. Try splitting each para by "\n", if len == 1 and len(para) < 100 (or something)
-> Merge with the next para
Ex. "8. Termination."
"""