Spaces:
Runtime error
Runtime error
import re | |
import json | |
from bs4 import BeautifulSoup | |
from striprtf.striprtf import rtf_to_text | |
# Sentinel that clean_license_text joins paragraphs with before calling
# character_cleaner, and swaps for "\n\n" afterwards. It is made of letters and
# underscores only, so the punctuation/whitespace collapsing in
# character_cleaner leaves it intact.
PARA_BREAK = "para___break"
def php_cleaner(text):
    """Return the first ``/* ... */`` block comment found in PHP source.

    Args:
        text: Contents of a PHP file.

    Returns:
        The first block comment (delimiters included), or "" when there is
        none or when *text* is not something the pattern can be applied to.
    """
    # An earlier, stricter pattern that was tried for the file header:
    # re.findall(r"(?<=<\?php\\n\\n\/\*\*\\n \*).*(?=\\n \*\/)", text)[0]
    try:
        # Non-greedy so the match ends at the first closing "*/".
        match = re.search(r"/\*[\S\s]*?\*/", text)
    except TypeError:
        # Keep the original best-effort contract for non-string input.
        return ""
    return match.group(0) if match else ""
def html_cleaner(text):
    """Extract the text content of an HTML document.

    Args:
        text: HTML markup.

    Returns:
        The text inside ``<body>``; when the document has no ``<body>``
        element, the text of the whole document. "" if nothing is left.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(text, "html.parser")
    # soup.body is None when the markup has no <body>; fall back to the whole
    # tree instead of raising AttributeError.
    root = soup.body if soup.body is not None else soup
    return root.get_text() or ""
def json_cleaner(text_dict):
    """Render the "description" and "license" entries of a mapping as text.

    Args:
        text_dict: Mapping parsed from a JSON document.

    Returns:
        A string of ``"<key>: <value>, "`` fragments, in the mapping's own
        iteration order, for those two keys only; "" if neither is present.
    """
    wanted = ("description", "license")
    fragments = [
        key + ": " + str(value) + ", "
        for key, value in text_dict.items()
        if key in wanted
    ]
    return "".join(fragments)
def discard_text_after_tnc(text):
    """Return the part of *text* before "END OF TERMS AND CONDITIONS".

    The whole text is returned when the marker does not occur.
    """
    head, _marker, _tail = text.partition("END OF TERMS AND CONDITIONS")
    return head
def _strip_preamble(body):
    """Remove the "Preamble" section from *body*; None if it cannot be located."""
    sections = body.split("Preamble")
    if len(sections) < 2:
        return None
    head, preamble = sections[0], sections[1]
    # The sentence closing the preamble shows up with different line wrapping.
    end_markers = (
        "distribution and\nmodification follow",
        "distribution and\n\nmodification follow",
        "distribution and modification follow",
    )
    for marker in end_markers:
        pieces = preamble.split(marker)
        if len(pieces) > 1:
            # A head of 100 characters or fewer is dropped (presumably it is
            # only the licence title -- confirm); a longer one is kept.
            return head + pieces[1] if len(head) > 100 else pieces[1]
    return None


def gnu_cleaner(text):
    """Trim boilerplate from a licence text and pull out its definitions section.

    Steps:
      1. Discard everything from "END OF TERMS AND CONDITIONS" onwards.
      2. If a "Preamble" section with a recognisable closing sentence exists,
         remove it. When it cannot be located the text is left unchanged
         instead of raising.
      3. If a numbered "Definitions" / "Additional Definitions" heading
         exists, cut the text from that heading up to the next numbered,
         capitalised heading out of the body.

    Args:
        text: Full licence text.

    Returns:
        A ``(body, definitions)`` tuple; ``definitions`` is "" when no
        definitions section was extracted.
    """
    t = text.split("END OF TERMS AND CONDITIONS")[0]
    definitions = ""

    if "Preamble" in text:
        stripped = _strip_preamble(t)
        if stripped is not None:
            t = stripped

    if "Definitions" in text:
        heading = re.search(r"[0-9]\.? (Additional )?Definitions", t)
        if heading is not None:
            # Look for the next numbered heading after the definitions heading.
            following = re.search(r"[0-9]\.? [A-Z][a-z]+", t[heading.end():])
            if following is not None:
                cut = heading.end() + following.start()
                definitions = t[heading.start():cut]
                t = t[:heading.start()] + t[cut:]

    return t, definitions
def rtf_cleaner(text):
    """Convert RTF markup to text by delegating to striprtf's ``rtf_to_text``."""
    plain = rtf_to_text(text)
    return plain
def url_cleaner(text):
    """Delete every run of non-whitespace characters that starts at "http"."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)
def email_cleaner(text):
    """Delete every whitespace-delimited token that contains an "@"."""
    at_token = re.compile(r"\S*@\S*")
    return at_token.sub("", text)
def var_cleaner(text):
    """Strip template placeholders from *text*.

    Removes ``$name`` style variables first, then brace-delimited spans whose
    content is limited to word characters, whitespace, nested braces,
    parentheses, square brackets, quotes, dots, commas and underscores.
    """
    without_dollar_vars = re.sub(r"\$\w+", "", text)
    return re.sub(r"{[{}()\w\s._,\[\]'\"]+}", "", without_dollar_vars)
def character_cleaner(text):
    """Remove URLs, e-mail tokens and placeholders, then normalise punctuation.

    After the three token cleaners run, the text goes through these
    substitutions in order:
      1. runs of two or more newlines become ". ";
      2. runs of ``: % # < > = * - / · { }`` and whitespace become one space;
      3. runs of two or more dots/spaces become ". ".

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    text = url_cleaner(text)
    text = email_cleaner(text)
    text = var_cleaner(text)
    # Raw strings: the patterns are unchanged, but "\-", "\s" and "\." are
    # invalid escapes in an ordinary literal (SyntaxWarning on Python 3.12+).
    text = re.sub(r"[\n]{2,}", ". ", text)
    text = re.sub(r"[:%#<>=*\-/·\s{}]+", " ", text)
    text = re.sub(r"[\. ]{2,}", ". ", text)
    return text
def isEnglish(s):
    """Return True when *s* consists solely of ASCII characters.

    Despite the name, no language detection happens; the check is purely
    "every character is ASCII". The empty string counts as ASCII.

    ``str.isascii`` is used instead of an encode/decode round trip, so strings
    holding lone surrogates yield False rather than raising
    UnicodeEncodeError.
    """
    return s.isascii()
def preprocess_text(text):
    """Run ``gnu_cleaner`` on texts that mention "GNU" or "Apache".

    Returns:
        ``(text, definitions)``. Texts mentioning neither word come back
        unchanged with "" as definitions; otherwise the cleaned text and the
        whitespace-stripped definitions from ``gnu_cleaner``.
    """
    if "GNU" not in text and "Apache" not in text:
        return text, ""
    cleaned, definitions = gnu_cleaner(text)
    return cleaned, definitions.strip()
def script_cleaner(text):
    """Unwrap licence text embedded in PHP, HTML, JSON or RTF content.

    The format is guessed from the content, in this order: a "<?php" tag, a
    closing "</html>" tag, a ``{...}`` shape that parses as JSON, an "\\rtf"
    control word. Text matching none of these is returned unchanged.

    Args:
        text: Raw file contents.

    Returns:
        The extracted text, or "" when the input or the extraction result is
        empty.
    """
    if not text:
        # Also avoids indexing into an empty string below.
        return ""

    if "<?php" in text:
        text = php_cleaner(text)
    elif "</html>" in text:
        text = html_cleaner(text)
    else:
        parsed = None
        if text.startswith("{") and text.endswith("}"):
            try:
                parsed = json.loads(text)
            except ValueError:
                # Brace-wrapped but not JSON. RTF documents have this shape
                # too, so fall through to the RTF check instead of failing.
                parsed = None
        if isinstance(parsed, dict):
            text = json_cleaner(parsed)
        elif "\\rtf" in text:
            text = rtf_cleaner(text)

    return text or ""
def split_paras(text):
    """Split *text* into paragraphs on its longest blank-line separator.

    Separators of four, three and two consecutive newlines are tried in that
    order; the first one present in the text is used for the split. Text
    without any of them is returned as a single-element list.
    """
    for separator in ("\n\n\n\n", "\n\n\n", "\n\n"):
        if separator in text:
            return text.split(separator)
    return [text]
def clean_paras(paras):
    """Per-paragraph cleaning hook; currently returns *paras* untouched."""
    unchanged = paras
    return unchanged
def clean_license_text(text):
    """Clean a raw licence file into plain paragraphs plus its definitions.

    Pipeline: unwrap the container format (``script_cleaner``), strip
    preamble/definitions (``preprocess_text``), split into paragraphs, clean
    characters, then restore paragraph breaks as blank lines.

    Args:
        text: Raw licence text.

    Returns:
        ``(cleaned_text, definitions)``. ``("", "")`` is returned for empty
        input, and when neither the cleaned text nor the four words preceding
        its last word are pure ASCII.
    """
    if not text:
        # Same 2-tuple shape as every other return path.
        return "", ""

    text = script_cleaner(text)
    text, definitions = preprocess_text(text)
    paras = clean_paras(split_paras(text))

    # Pad the sentinel with spaces so the \S-based URL / e-mail / $variable
    # patterns in character_cleaner cannot swallow it when a paragraph ends or
    # starts with such a token.
    text = (" " + PARA_BREAK + " ").join(paras)
    text = character_cleaner(text)
    # Drop the padding again while turning sentinels into blank lines.
    text = re.sub(" ?" + re.escape(PARA_BREAK) + " ?", "\n\n", text)
    text = text.strip()

    if not isEnglish(text) and not isEnglish(" ".join(text.split()[-5:-1])):
        return "", ""
    return text, definitions
""" | |
Notes: | |
1. Regex for other definitions: --------> ".{0,20}".{0,40}means | |
2. Try splitting each para by "\n", if len == 1 and len(para) < 100 (or something) | |
-> Merge with the next para | |
Ex. "8. Termination." | |
""" |