Nihal D'Souza
Custom textrank, changes to UI
a804ced
raw
history blame
4.26 kB
import re
import json
from bs4 import BeautifulSoup
from striprtf.striprtf import rtf_to_text
# Placeholder token that clean_license_text joins between paragraphs before
# calling character_cleaner (which collapses whitespace runs, newlines
# included) and swaps back to "\n\n" afterwards.
PARA_BREAK = "para___break"
def php_cleaner(text):
    """Return the first ``/* ... */`` block comment found in PHP source.

    Args:
        text: PHP source code as a string.

    Returns:
        The first block comment, delimiters included, or ``""`` when the
        source contains no block comment.
    """
    # Non-greedy so the match ends at the first closing ``*/``; ``[\S\s]``
    # lets the comment span several lines without needing re.DOTALL.
    match = re.search(r"/\*[\S\s]*?\*/", text)
    if match is None:
        return ""
    return match.group(0)
def html_cleaner(text):
    """Extract the text content of an HTML document.

    Args:
        text: HTML markup as a string.

    Returns:
        The text inside the ``<body>`` element. If the parsed tree has no
        ``<body>``, the text of the whole parsed document is returned
        instead of failing. Always a string, possibly empty.
    """
    # NOTE(review): no parser is named here, so the result depends on which
    # parser BeautifulSoup selects in the running environment -- consider
    # pinning one explicitly once the intended parser is confirmed.
    soup = BeautifulSoup(text)
    body = soup.body
    # ``soup.body`` is None when the parsed tree has no <body> element;
    # fall back to the whole tree rather than dereferencing None.
    container = soup if body is None else body
    return container.get_text() or ""
def json_cleaner(text_dict):
    """Render the ``description`` and ``license`` entries of a mapping as text.

    Args:
        text_dict: A mapping, e.g. a parsed JSON manifest.

    Returns:
        A string made of ``"<key>: <value>, "`` fragments, one per matching
        key, in the mapping's own iteration order (the trailing ``", "`` is
        kept). Returns ``""`` when neither key is present.
    """
    wanted_keys = ("description", "license")
    # Build all fragments first and join once instead of growing a string
    # with repeated ``+=``.
    return "".join(
        key + ": " + str(value) + ", "
        for key, value in text_dict.items()
        if key in wanted_keys
    )
def discard_text_after_tnc(text):
    """Keep only what precedes the first "END OF TERMS AND CONDITIONS" marker.

    Text without the marker is returned unchanged.
    """
    kept, _marker, _discarded = text.partition("END OF TERMS AND CONDITIONS")
    return kept
def gnu_cleaner(text):
    """Remove the preamble and split off the definitions section of a license.

    Processing steps:

    1. Everything from "END OF TERMS AND CONDITIONS" onwards is discarded.
    2. If the remaining text contains "Preamble", the span from "Preamble"
       up to and including the phrase "distribution and modification follow"
       (three whitespace variants are tried) is removed. Text preceding
       "Preamble" is kept only when it is longer than 100 characters.
    3. If the text contains "Definitions", the section starting at a
       numbered "Definitions" / "Additional Definitions" heading and ending
       just before the next numbered heading is cut out of the text.

    Each step is best-effort: when its markers are not found the text is
    left as it is rather than raising.

    Args:
        text: Full license text.

    Returns:
        A ``(cleaned_text, definitions)`` tuple. ``definitions`` is ``""``
        when no definitions section was extracted.
    """
    body = text.split('END OF TERMS AND CONDITIONS')[0]
    definitions = ""

    # Check the truncated body rather than the full text: a "Preamble" that
    # only occurs after the end-of-terms marker leaves nothing to split on.
    if 'Preamble' in body:
        preamble_parts = body.split('Preamble')
        before_preamble = preamble_parts[0]
        after_preamble = preamble_parts[1]

        # Whitespace variants of the phrase that closes the preamble.
        end_of_preamble_markers = (
            'distribution and\nmodification follow',
            'distribution and\n\nmodification follow',
            'distribution and modification follow',
        )
        terms = None
        for marker in end_of_preamble_markers:
            pieces = after_preamble.split(marker)
            if len(pieces) > 1:
                terms = pieces[1]
                break

        # With no recognisable end of the preamble, leave the body alone.
        if terms is not None:
            if len(before_preamble) > 100:
                # A long lead-in is kept and re-attached to the terms.
                body = before_preamble + terms
            else:
                body = terms

    if 'Definitions' in text:
        heading = re.search(r"[0-9]\.? (Additional )?Definitions", body)
        if heading is not None:
            section_start, heading_end = heading.span()
            # The next numbered heading after the definitions heading marks
            # where the definitions section stops.
            next_heading = re.search(
                r"[0-9]\.? [A-Z][a-z]+", body[heading_end:]
            )
            if next_heading is not None:
                section_end = heading_end + next_heading.start()
                definitions = body[section_start:section_end]
                body = body[:section_start] + body[section_end:]

    return body, definitions
def rtf_cleaner(text):
    """Convert RTF markup to plain text via ``striprtf``'s ``rtf_to_text``."""
    plain_text = rtf_to_text(text)
    return plain_text
def url_cleaner(text):
    """Delete every ``http`` occurrence together with the non-space run after it.

    The match is not anchored to a word boundary, so it covers both
    ``http://`` and ``https://`` links.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)
def email_cleaner(text):
    """Delete every whitespace-delimited token that contains an ``@``."""
    email_pattern = re.compile(r"\S*@\S*")
    return email_pattern.sub("", text)
def var_cleaner(text):
    """Strip template placeholders from *text*.

    Two forms are removed, in this order: ``$name`` style variables, then
    brace-delimited groups such as ``{name}`` whose content consists only of
    word characters, whitespace, braces, parentheses, brackets, quotes,
    dots, underscores and commas.
    """
    placeholder_patterns = (
        r"\$\w+",
        r"{[{}()\w\s._,\[\]'\"]+}",
    )
    for pattern in placeholder_patterns:
        text = re.sub(pattern, "", text)
    return text
def character_cleaner(text):
    """Remove URLs, e-mail addresses, placeholders and punctuation noise.

    Args:
        text: Text to normalise.

    Returns:
        The text with URLs, ``@`` tokens and template variables removed,
        blank-line gaps turned into ". ", runs of separator characters and
        whitespace collapsed to a single space, and repeated dots/spaces
        collapsed to ". ".
    """
    text = url_cleaner(text)
    text = email_cleaner(text)
    text = var_cleaner(text)
    # Patterns are raw strings so that ``\-``, ``\s`` and ``\.`` reach the
    # regex engine without relying on Python passing unknown string escapes
    # through (which emits an invalid-escape warning).
    # Two or more consecutive newlines become a sentence boundary.
    text = re.sub(r"[\n]{2,}", ". ", text)
    # Runs of separator characters and any whitespace (including the
    # remaining single newlines) collapse to one space.
    text = re.sub(r"[:%#<>=*\-/·\s{}]+", " ", text)
    # Collapse the dot/space clusters left over from the steps above.
    text = re.sub(r"[\. ]{2,}", ". ", text)
    return text
def isEnglish(s):
    """Report whether *s* is made up of ASCII characters only.

    Returns ``True`` when the UTF-8 encoding of *s* decodes as ASCII,
    ``False`` otherwise.
    """
    encoded = s.encode(encoding="utf-8")
    try:
        encoded.decode("ascii")
    except UnicodeDecodeError:
        return False
    return True
def preprocess_text(text):
    """Run ``gnu_cleaner`` on texts that mention "GNU" or "Apache".

    Returns a ``(text, definitions)`` tuple. Texts mentioning neither word
    are returned untouched with empty definitions; otherwise the result of
    ``gnu_cleaner`` is returned with the definitions stripped of surrounding
    whitespace.
    """
    if "GNU" not in text and "Apache" not in text:
        return text, ""
    cleaned, definitions = gnu_cleaner(text)
    return cleaned, definitions.strip()
def script_cleaner(text):
    """Reduce PHP, HTML, JSON or RTF input to its license-relevant text.

    The format is guessed from the content, in this order: PHP (contains
    ``<?php``), HTML (contains ``</html>``), JSON (starts with ``{``, ends
    with ``}`` and parses), RTF (contains ``\\rtf``). Anything else is
    returned unchanged.

    Args:
        text: Raw file content as a string; may be empty.

    Returns:
        The extracted text, or ``""`` when extraction yields nothing.
    """
    if "<?php" in text:
        text = php_cleaner(text)
    elif "</html>" in text:
        text = html_cleaner(text)
    else:
        parsed = None
        # startswith/endswith are safe on an empty string, unlike indexing.
        if text.startswith("{") and text.endswith("}"):
            try:
                parsed = json.loads(text)
            except ValueError:
                # Brace-delimited but not JSON; RTF that starts with
                # ``{\rtf`` and ends with ``}`` lands here, so fall through
                # to the RTF check.
                parsed = None
        if isinstance(parsed, dict):
            text = json_cleaner(parsed)
        elif "\\rtf" in text:
            text = rtf_cleaner(text)
    return text or ""
def split_paras(text):
    """Split *text* into paragraphs on its widest blank-line separator.

    Separators of four, three and two consecutive newlines are tried in that
    order; the first one present in the text is used for the split. Text
    with none of them comes back as a single-element list.
    """
    for separator in ("\n\n\n\n", "\n\n\n", "\n\n"):
        if separator in text:
            return text.split(separator)
    return [text]
def clean_paras(paras):
    """Hook for per-paragraph cleaning; currently hands *paras* back as-is."""
    return paras
def clean_license_text(text):
    """Run the full cleaning pipeline on a raw license file.

    Args:
        text: Raw license text (plain text, PHP, HTML, JSON or RTF).

    Returns:
        A ``(cleaned_text, definitions)`` tuple. Both elements are ``""``
        when the input is empty or when the cleaned text is judged to be
        non-English.
    """
    # Return the same 2-tuple shape as every other path so callers can
    # always unpack the result.
    if not text:
        return "", ""

    text = script_cleaner(text)
    text, definitions = preprocess_text(text)

    # Mark paragraph boundaries with a placeholder so they survive
    # character_cleaner's whitespace collapsing, then restore them.
    paras = clean_paras(split_paras(text))
    text = character_cleaner(PARA_BREAK.join(paras))
    text = text.replace(PARA_BREAK, "\n\n").strip()

    # Non-ASCII text is rejected only if a sample of words near the end
    # (the four words before the last one) is non-ASCII as well, so a few
    # stray non-ASCII characters do not discard the whole text.
    if not isEnglish(text) and not isEnglish(" ".join(text.split()[-5:-1])):
        return "", ""
    return text, definitions
"""
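Notes / ideas for future work:
1. Candidate regex for picking up other inline definitions: ".{0,20}".{0,40}means
2. Try splitting each paragraph on "\n"; if that yields a single line and the
paragraph is short (len(para) < 100 or so), merge it with the next paragraph.
Example: a bare heading such as "8. Termination."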
"""