Spaces:

nihaldsouza1
/

clearlydefined_license_summarizer

Runtime error

clearlydefined_license_summarizer / src /clean.py

Nihal D'Souza

Custom textrank, changes to UI

a804ced about 3 years ago

4.26 kB

	import re
	import json
	from bs4 import BeautifulSoup
	from striprtf.striprtf import rtf_to_text


	# Sentinel token joined between paragraphs in clean_license_text so that the
	# paragraph boundaries are not collapsed by character_cleaner's whitespace
	# rules; it is swapped back to "\n\n" once cleaning is done.
	PARA_BREAK = "para___break"


	def php_cleaner(text):
	    """Return the first ``/* ... */`` block comment found in PHP source.

	    The comment is returned including its delimiters. An empty string is
	    returned when the text contains no complete block comment.

	    Args:
	        text: PHP source code as a string.

	    Returns:
	        The first block comment, or ``""`` if there is none.
	    """
	    # Non-greedy so the match stops at the first closing "*/"; [\s\S] lets
	    # the comment span several lines without needing re.DOTALL.
	    match = re.search(r"/\*[\s\S]*?\*/", text)
	    return match.group(0) if match else ""


	def html_cleaner(text):
	    """Extract the visible text of an HTML document.

	    Args:
	        text: HTML markup as a string.

	    Returns:
	        The text content of ``<body>``; when the markup has no ``<body>``
	        element, the text of the whole document. ``""`` if nothing is found.
	    """
	    # Name the parser explicitly: letting BeautifulSoup guess makes the
	    # result depend on which optional parsers happen to be installed.
	    soup = BeautifulSoup(text, "html.parser")
	    # soup.body is None for fragments without a <body> tag; fall back to the
	    # whole tree instead of failing with AttributeError.
	    container = soup.body if soup.body is not None else soup
	    return container.get_text() or ""


	def json_cleaner(text_dict):
	    """Summarise the "description" and "license" entries of a parsed JSON object.

	    Args:
	        text_dict: Mapping produced from a JSON document.

	    Returns:
	        A string of ``"<key>: <value>, "`` fragments, one per selected key,
	        in the mapping's own iteration order. Empty if neither key is present.
	    """
	    wanted = ("description", "license")
	    fragments = [
	        field + ": " + str(value) + ", "
	        for field, value in text_dict.items()
	        if field in wanted
	    ]
	    return "".join(fragments)


	def discard_text_after_tnc(text):
	    """Return the part of ``text`` that precedes "END OF TERMS AND CONDITIONS".

	    The text is returned unchanged when the marker does not occur.
	    """
	    before_marker, _, _ = text.partition("END OF TERMS AND CONDITIONS")
	    return before_marker


	# Spellings of the sentence that closes the preamble, tried in this order.
	_PREAMBLE_END_MARKERS = (
	    'distribution and\nmodification follow',
	    'distribution and\n\nmodification follow',
	    'distribution and modification follow',
	)


	def _text_after_preamble(after_heading):
	    """Return the text following the preamble's closing sentence, or None."""
	    for marker in _PREAMBLE_END_MARKERS:
	        pieces = after_heading.split(marker)
	        if len(pieces) > 1:
	            return pieces[1]
	    return None


	def gnu_cleaner(text):
	    """Strip the preamble and the definitions section from a license text.

	    Steps, in order:
	      1. Everything from "END OF TERMS AND CONDITIONS" onwards is dropped.
	      2. If a "Preamble" heading is present, the text between that heading
	         and the "... distribution and modification follow" sentence is
	         removed. The text before the heading is kept only when it is
	         longer than 100 characters. If the closing sentence cannot be
	         found the text is left as it is (previously this raised
	         IndexError).
	      3. If a numbered "Definitions" / "Additional Definitions" heading is
	         found, the span from that heading up to the next numbered heading
	         is cut out and returned separately.

	    Args:
	        text: The license text.

	    Returns:
	        A ``(cleaned_text, definitions)`` tuple; ``definitions`` is ``""``
	        when no definitions section was extracted.
	    """
	    t = text.split('END OF TERMS AND CONDITIONS')[0]
	    definitions = ""

	    # Check the truncated text: a "Preamble" that only occurs after the
	    # terms marker has nothing to strip.
	    if 'Preamble' in t:
	        sections = t.split('Preamble')
	        head = sections[0]
	        remainder = _text_after_preamble(sections[1])
	        if remainder is not None:
	            t = head + remainder if len(head) > 100 else remainder

	    if 'Definitions' in text:
	        heading = re.search(r"[0-9]\.? (Additional )?Definitions", t)
	        if heading is not None:
	            start, end = heading.span()
	            # The next numbered heading marks the end of the definitions.
	            following = re.search(r"[0-9]\.? [A-Z][a-z]+", t[end:])
	            if following is not None:
	                cut = end + following.start()
	                definitions = t[start:cut]
	                t = t[:start] + t[cut:]

	    return t, definitions


	def rtf_cleaner(text):
	    """Convert RTF markup to plain text using striprtf's ``rtf_to_text``."""
	    plain_text = rtf_to_text(text)
	    return plain_text


	def url_cleaner(text):
	    """Delete every run of non-whitespace characters that starts with "http"."""
	    url_pattern = re.compile(r"http\S+")
	    return url_pattern.sub("", text)


	def email_cleaner(text):
	    """Remove e-mail addresses from ``text``.

	    Every whitespace-delimited token containing "@" is deleted as a whole,
	    so no fragments of the address are left behind.

	    Args:
	        text: Input string.

	    Returns:
	        ``text`` with all "@" tokens removed.
	    """
	    # \S* on both sides consumes the entire token, not just the characters
	    # adjacent to "@".
	    return re.sub(r"\S*@\S*", "", text)


	def var_cleaner(text):
	    """Remove template placeholders from ``text``.

	    Two kinds of placeholder are deleted, in this order: ``$name`` style
	    variables, then brace-delimited spans made up of word characters,
	    whitespace, braces, parentheses, brackets, quotes, dots, underscores
	    and commas.
	    """
	    placeholder_patterns = (
	        r"\$\w+",
	        r"{[{}()\w\s._,\[\]'\"]+}",
	    )
	    for pattern in placeholder_patterns:
	        text = re.sub(pattern, "", text)
	    return text


	def character_cleaner(text):
	    """Normalise punctuation and whitespace after removing noise tokens.

	    URLs, e-mail addresses and template variables are removed first (via
	    url_cleaner, email_cleaner and var_cleaner). Then:
	      * runs of two or more newlines become ". ";
	      * runs of the symbols ``: % # < > = * - / · { }`` and whitespace
	        become a single space;
	      * runs of two or more dots/spaces become ". ".

	    Args:
	        text: Input string.

	    Returns:
	        The normalised string.
	    """
	    text = url_cleaner(text)
	    text = email_cleaner(text)
	    text = var_cleaner(text)

	    # Raw strings: the patterns are unchanged, but "\-", "\s" and "\." are
	    # invalid escapes in a normal string literal and trigger a warning.
	    text = re.sub(r"\n{2,}", ". ", text)
	    text = re.sub(r"[:%#<>=*\-/·\s{}]+", " ", text)
	    text = re.sub(r"[. ]{2,}", ". ", text)
	    return text


	def isEnglish(s):
	    """Return True when ``s`` consists solely of ASCII characters.

	    The string is encoded as UTF-8 and decoded as ASCII; a decoding failure
	    means at least one non-ASCII character is present.
	    """
	    utf8_bytes = s.encode(encoding="utf-8")
	    try:
	        utf8_bytes.decode("ascii")
	    except UnicodeDecodeError:
	        return False
	    return True


	def preprocess_text(text):
	    """Apply gnu_cleaner to texts that mention "GNU" or "Apache".

	    Returns:
	        A ``(text, definitions)`` tuple. For other texts the input is
	        returned untouched with empty definitions; otherwise the cleaned
	        text and the whitespace-stripped definitions from gnu_cleaner.
	    """
	    if "GNU" not in text and "Apache" not in text:
	        return text, ""
	    cleaned, definitions = gnu_cleaner(text)
	    return cleaned, definitions.strip()


	def script_cleaner(text):
	    """Extract license text from PHP, HTML, JSON or RTF wrappers.

	    The wrapper type is detected from the content and the matching cleaner
	    (php_cleaner, html_cleaner, json_cleaner, rtf_cleaner) is applied. Text
	    that matches none of them is returned unchanged.

	    Args:
	        text: Raw file content.

	    Returns:
	        The extracted text, or ``""`` when the input or the result is empty.
	    """
	    if not text:
	        # Guard before indexing text[0] / text[-1] below.
	        return ""

	    if "<?php" in text:
	        text = php_cleaner(text)
	    elif "</html>" in text:
	        text = html_cleaner(text)
	    else:
	        parsed = None
	        if text[0] == "{" and text[-1] == "}":
	            try:
	                parsed = json.loads(text)
	            except json.JSONDecodeError:
	                # Brace-wrapped but not JSON. RTF documents look like this
	                # too ("{\rtf1 ... }"), so fall through to the RTF check.
	                parsed = None
	        if isinstance(parsed, dict):
	            text = json_cleaner(parsed)
	        elif "\\rtf" in text:
	            text = rtf_cleaner(text)

	    return text or ""


	def split_paras(text):
	    """Split ``text`` into paragraphs on its widest blank-line separator.

	    Separators of four, three and two consecutive newlines are tried in
	    that order; the first one that occurs in the text is used for the split.
	    Text without any of them is returned as a single-element list.
	    """
	    for separator in ("\n\n\n\n", "\n\n\n", "\n\n"):
	        if separator in text:
	            return text.split(separator)
	    return [text]


	def clean_paras(paras):
	    """Hook for per-paragraph cleaning; currently returns ``paras`` unchanged."""
	    return paras


	def clean_license_text(text):
	    """Run the full cleaning pipeline on a raw license file.

	    Pipeline: unwrap script/markup containers (script_cleaner), strip the
	    GNU/Apache preamble and definitions (preprocess_text), split into
	    paragraphs, normalise characters (character_cleaner) and restore the
	    paragraph breaks as blank lines.

	    Args:
	        text: Raw license file content.

	    Returns:
	        A ``(cleaned_text, definitions)`` tuple. ``("", "")`` is returned
	        for empty input, and also when neither the cleaned text nor the
	        words at positions ``[-5:-1]`` of it are pure ASCII.
	    """
	    if not text:
	        # Same 2-tuple shape as every other return path, so callers can
	        # always unpack the result.
	        return "", ""

	    text = script_cleaner(text)
	    text, definitions = preprocess_text(text)
	    paras = clean_paras(split_paras(text))

	    # Paragraph breaks are protected with a sentinel while character_cleaner
	    # collapses whitespace, then restored.
	    # NOTE(review): paragraphs are joined without surrounding whitespace, so
	    # a URL or e-mail token at the very end of a paragraph is glued to the
	    # sentinel and removed together with it - confirm whether that matters.
	    text = PARA_BREAK.join(paras)
	    text = character_cleaner(text)
	    text = text.replace(PARA_BREAK, "\n\n")
	    text = text.strip()

	    # Tolerate stray non-ASCII characters: the text is rejected only if the
	    # sample of words near its end is non-ASCII as well.
	    if not isEnglish(text) and not isEnglish(" ".join(text.split()[-5:-1])):
	        return "", ""

	    return text, definitions


	"""
	Notes:

	1. Regex for other definitions: --------> ".{0,20}".{0,40}means
	2. Try splitting each para by "\n"; if that yields a single line and len(para) < 100
	(threshold still to be tuned) -> merge it with the next para.
	E.g. a bare section heading such as "8. Termination."
	"""