import json
import re

from bs4 import BeautifulSoup
from striprtf.striprtf import rtf_to_text

# Sentinel inserted between paragraphs so boundaries survive character cleaning.
PARA_BREAK = "para___break"


def php_cleaner(text):
    """Extract the first /* ... */ comment block (license header) from PHP source.

    Returns "" when no such block is present.
    """
    try:
        return re.findall(r"/\*[\S\s]*?\*/", text)[0]
    except IndexError:  # findall returned no match
        return ""


def html_cleaner(text):
    """Return the visible <body> text of an HTML document, or "" if absent/empty."""
    # Name the parser explicitly: bare BeautifulSoup(text) warns and picks
    # whichever parser happens to be installed, which is nondeterministic.
    soup = BeautifulSoup(text, "html.parser")
    body = soup.body
    if body is None:  # no <body> tag; the original crashed with AttributeError here
        return ""
    text = body.text
    if not text:
        return ""
    return text


def json_cleaner(text_dict):
    """Concatenate the "description" and "license" entries of a package dict.

    Output preserves the dict's own key order, e.g. 'license: MIT, '.
    """
    parts = []
    for key in text_dict:  # iterate the dict so original key order is kept
        if key in ("description", "license"):
            parts.append(f"{key}: {text_dict[key]}, ")
    return "".join(parts)


def discard_text_after_tnc(text):
    """Drop everything after the 'END OF TERMS AND CONDITIONS' marker."""
    return text.split("END OF TERMS AND CONDITIONS")[0]


def gnu_cleaner(text):
    """Strip GNU-style boilerplate (preamble, terms tail) from a license text.

    Returns a (cleaned_text, definitions) tuple; ``definitions`` is the
    extracted numbered 'Definitions' section, or "" when none was found.
    """
    t = text.split('END OF TERMS AND CONDITIONS')[0]
    definitions = ""
    if 'Preamble' in text:
        if len(t.split('Preamble')[0]) > 100:
            # Substantial text precedes the preamble: keep it, drop the preamble
            # body up to the 'distribution and modification follow' sentence.
            t0 = t.split('Preamble')[0]
            try:
                t1 = t.split('Preamble')[1].split('distribution and\nmodification follow')[1]
            except IndexError:  # split found no such sentence; try variant spacings
                try:
                    t1 = t.split('Preamble')[1].split('distribution and\n\nmodification follow')[1]
                except IndexError:
                    t1 = t.split('Preamble')[1].split('distribution and modification follow')[1]
            t = t0 + t1
        else:
            t = t.split('Preamble')[1].split('distribution and\nmodification follow')[1]
    if 'Definitions' in text:
        try:
            # Locate the numbered Definitions heading and the next numbered
            # heading after it; everything between is the definitions section.
            def_pos = re.search(r"[0-9]\.? (Additional )?Definitions", t).span()
            other_start_pos = re.search(r"[0-9]\.? [A-Z][a-z]+", t[def_pos[1]:]).span()[0]
            definitions = t[def_pos[0]: def_pos[1] + other_start_pos]
            t = t[:def_pos[0]] + t[def_pos[1] + other_start_pos:]
        except (AttributeError, IndexError):
            # AttributeError: a search matched nothing (.span() on None).
            # Leave t and definitions untouched, as the original did.
            pass
    return t, definitions


def rtf_cleaner(text):
    """Convert RTF markup to plain text."""
    return rtf_to_text(text)


def url_cleaner(text):
    """Remove http/https URLs."""
    return re.sub(r"http\S+", "", text)


def email_cleaner(text):
    """Remove email-address-like tokens (anything containing '@')."""
    return re.sub(r"\S*@\S*", "", text)


def var_cleaner(text):
    """Remove template variables such as ``$name`` and ``{placeholder}``."""
    text = re.sub(r"\$\w+", "", text)
    text = re.sub(r"{[{}()\w\s._,\[\]'\"]+}", "", text)
    return text


def character_cleaner(text):
    """Strip URLs, emails, template variables and noisy punctuation from text."""
    text = url_cleaner(text)
    text = email_cleaner(text)
    text = var_cleaner(text)
    # Collapse runs of blank lines into a sentence break.
    text = re.sub("[\n]{2,}", ". \n", text)
    # Replace separator/markup characters with a single space.  The Thai pair
    # "ยท" is retained for backward compatibility: it is the cp874 mojibake of
    # the intended middle dot "·", which is now matched as well.
    text = re.sub("[:%#<>=*\\-/·ยท\\s{}]+", " ", text)
    text = re.sub("[\\. ]{2,}", ". ", text)
    return text


def isEnglish(s):
    """Return True when *s* consists solely of ASCII characters."""
    try:
        s.encode(encoding="utf-8").decode("ascii")
    except UnicodeDecodeError:
        return False
    else:
        return True


def preprocess_text(text):
    """Route GNU/Apache-style licenses through gnu_cleaner.

    Returns a (text, definitions) tuple; ``definitions`` is "" for other
    license families.
    """
    definitions = ""
    if "GNU" in text or "Apache" in text:
        text, definitions = gnu_cleaner(text)
        definitions = definitions.strip()
    return text, definitions


def script_cleaner(text):
    """Dispatch raw license text to the matching format-specific cleaner.

    Detects HTML, RTF and JSON payloads; anything else is passed through
    unchanged.  Returns "" when the input (or the cleaned result) is empty.
    """
    if not text:
        # Guard: the original indexed text[0] and crashed on empty input.
        return ""
    if "<html" in text.lower() or "<body" in text.lower():
        # The original tested `"" in text`, which is True for every string —
        # an HTML marker was evidently lost; detect HTML by its tags instead.
        text = html_cleaner(text)
    elif "\\rtf" in text:
        # Checked before the JSON branch: RTF documents are also wrapped in
        # {...} and would otherwise crash json.loads.
        text = rtf_cleaner(text)
    elif text[0] == "{" and text[-1] == "}":
        text = json_cleaner(json.loads(text))
    if not text:
        return ""
    return text


def split_paras(text):
    """Split text into paragraphs on the widest blank-line separator present."""
    for sep in ("\n\n\n\n", "\n\n\n", "\n\n"):
        if sep in text:
            return text.split(sep)
    return [text]


def clean_paras(paras):
    """Per-paragraph cleaning hook; currently the identity."""
    return paras


def clean_license_text(text):
    """Clean a raw license text end to end.

    Returns a (cleaned_text, definitions) tuple.  Empty or non-English input
    yields ("", "").
    """
    if len(text) == 0:
        # The original returned a bare "" here while every other path returns
        # a 2-tuple, breaking callers that unpack two values.
        return "", ""
    text = script_cleaner(text)
    text, definitions = preprocess_text(text)
    paras = clean_paras(split_paras(text))
    text = PARA_BREAK.join(paras)
    text = character_cleaner(text)
    text = re.sub(PARA_BREAK, "\n\n", text)
    text = text.strip()
    if not isEnglish(text):
        # Second chance: test only the tail, which is less likely to contain
        # author names or other non-ASCII noise.
        if not isEnglish(" ".join(text.split()[-5:-1])):
            return "", ""
    return text, definitions


"""
Notes:
1. Regex for other definitions: --------> ".{0,20}".{0,40}means
2. Try splitting each para by "\n",
   if len == 1 and len(para) < 100 (or something) -> Merge with the next para
   Ex. "8. Termination."
"""